mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-31 12:55:35 +00:00

This PR removes usage of `PageLayout.elements` from the partition function, except when `analysis=True`, and updates the partition logic so that `PageLayout.elements_array` is used everywhere else to save memory and CPU cost. Because the analysis function is intended for investigation rather than general document processing, that part of the code is left for a future refactor. `PageLayout.elements` stores layout elements' data in a list, while `elements_array` stores it in a `numpy` array, which has much lower memory requirements. Measured with `memory_profiler`, the difference in memory usage is usually around 10x.
266 lines
8.9 KiB
Python
266 lines
8.9 KiB
Python
from unittest.mock import patch
|
|
|
|
import numpy as np
|
|
import pytest
|
|
from pdfminer.layout import LAParams
|
|
from PIL import Image
|
|
from unstructured_inference.constants import Source as InferenceSource
|
|
from unstructured_inference.inference.elements import (
|
|
EmbeddedTextRegion,
|
|
Rectangle,
|
|
TextRegion,
|
|
TextRegions,
|
|
)
|
|
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
|
|
from unstructured_inference.inference.layoutelement import LayoutElements
|
|
|
|
from test_unstructured.unit_utils import example_doc_path
|
|
from unstructured.partition.auto import partition
|
|
from unstructured.partition.pdf_image.pdfminer_processing import (
|
|
_validate_bbox,
|
|
aggregate_embedded_text_by_block,
|
|
bboxes1_is_almost_subregion_of_bboxes2,
|
|
boxes_self_iou,
|
|
clean_pdfminer_inner_elements,
|
|
process_file_with_pdfminer,
|
|
remove_duplicate_elements,
|
|
)
|
|
from unstructured.partition.utils.constants import Source
|
|
|
|
# Fixture: one "Table" element followed by two text elements with
# source=Source.PDFMINER whose bounding boxes lie inside the table's box.
deletable_elements_inside_table = [
    LayoutElement(bbox=Rectangle(0, 0, 100, 100), text="Table with inner elements", type="Table"),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=Source.PDFMINER,
    ),
]
|
|
|
|
# Fixture: a "Table" element plus two text elements inside its bounding box,
# none of which has source=Source.PDFMINER (all are InferenceSource.YOLOX).
no_deletable_elements_inside_table = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=InferenceSource.YOLOX,
    ),
]
|
|
# Fixture: two "Table" elements, each with elements inside its bounding box,
# mixed with other elements that have source=Source.PDFMINER.
# Note: some of the source=Source.PDFMINER elements are not inside any table.
mix_elements_inside_table = [
    # First table and the elements located inside it.
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table1 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="Inside table1",
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="Inside table1",
        source=Source.PDFMINER,
    ),
    # pdfminer elements whose boxes fall outside both tables.
    LayoutElement(bbox=Rectangle(150, 150, 170, 170), text="Outside tables", source=Source.PDFMINER),
    LayoutElement(bbox=Rectangle(180, 180, 200, 200), text="Outside tables", source=Source.PDFMINER),
    # Second table and the pdfminer elements located inside it.
    LayoutElement(
        bbox=Rectangle(0, 500, 100, 700),
        text="Table2 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(0, 510, 50, 600),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(0, 550, 70, 650),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
]
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("bbox", "is_valid"),
    [
        ([0, 1, 0, 1], False),
        ([0, 1, 1, 2], True),
        ([0, 1, 1, None], False),
        ([0, 1, 1, np.nan], False),
        ([0, 1, -1, 0], False),
        ([0, 1, -1, 2], False),
    ],
)
def test_valid_bbox(bbox, is_valid):
    """Check that `_validate_bbox` returns exactly the expected boolean for each bbox."""
    verdict = _validate_bbox(bbox)
    # Identity comparison: the result must be the bool itself, not merely truthy/falsy.
    assert verdict is is_valid
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("elements", "length_extra_info", "expected_document_length"),
    [
        (deletable_elements_inside_table, 1, 1),
        (no_deletable_elements_inside_table, 0, 3),
        (mix_elements_inside_table, 2, 5),
    ],
)
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    """Check how many elements remain on the page after `clean_pdfminer_inner_elements`.

    NOTE(review): `length_extra_info` is parametrized but never read by this test body.
    """
    # Build a single-page document whose elements_array holds the fixture elements.
    single_page = PageLayout(number=1, image=Image.new("1", (1, 1)))
    single_page.elements_array = LayoutElements.from_list(elements)
    document = DocumentLayout(pages=[single_page])

    # Run the cleaning step on the document.
    cleaned_doc = clean_pdfminer_inner_elements(document)

    # Only the number of elements left on the first page is verified.
    remaining = cleaned_doc.pages[0].elements_array
    assert len(remaining) == expected_document_length
|
|
|
|
|
|
# Fixture: two "Image" elements with identical text and heavily overlapping
# bounding boxes, followed by a "Title" element elsewhere on the page.
elements_with_duplicate_images = [
    LayoutElement(bbox=Rectangle(0, 0, 100, 100), text="Image1", type="Image", source=Source.PDFMINER),
    LayoutElement(
        bbox=Rectangle(10, 10, 110, 110),
        text="Image1",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
|
|
|
|
# Fixture: three "Image" elements where no pair shares both the same text and
# an overlapping bounding box, followed by a "Title" element.
elements_without_duplicate_images = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100), text="Sample image", type="Image", source=Source.PDFMINER
    ),
    LayoutElement(
        # Overlaps the first image's box but carries different text.
        bbox=Rectangle(10, 10, 110, 110),
        text="Sample image with similar bbox",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        # Same text as the first image but a disjoint box.
        bbox=Rectangle(200, 200, 250, 250), text="Sample image", type="Image", source=Source.PDFMINER
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
|
|
|
|
|
|
def test_aggregate_by_block():
    """Check the text `aggregate_embedded_text_by_block` builds for one target region."""
    # Four embedded regions: two with text, one with text=None, and one
    # labelled "Outside region" that the expected string does not include.
    candidates = [
        TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
        TextRegion.from_coords(20, 20, 80, 80, None),
        TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
        TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
    ]
    embedded_regions = TextRegions.from_list(candidates)
    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

    aggregated = aggregate_embedded_text_by_block(target_region, embedded_regions)

    assert aggregated == "Inside region1 Inside region2"
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("coords1", "coords2", "expected"),
    [
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [0, 0, 12, 12]],
            [[True, True], [False, False], [False, False]],
        ),
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [False, False, False], [False, True, False]],
        ),
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [True, True, False]],
        ),
    ],
)
def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
    """Compare `bboxes1_is_almost_subregion_of_bboxes2` output with the expected boolean matrix.

    `expected` has one row per box in `coords1` and one column per box in `coords2`.
    """
    first_boxes = [Rectangle(*coords) for coords in coords1]
    second_boxes = [Rectangle(*coords) for coords in coords2]

    actual = bboxes1_is_almost_subregion_of_bboxes2(first_boxes, second_boxes)

    np.testing.assert_array_equal(actual, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("coords", "threshold", "expected"),
    [
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.5,
            [[True, True, False], [True, True, False], [False, False, True]],
        ),
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.9,
            [[True, False, False], [False, True, False], [False, False, True]],
        ),
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            0.5,
            [[True, False], [False, True]],
        ),
    ],
)
def test_boxes_self_iou(coords, threshold, expected):
    """Compare `boxes_self_iou` output at `threshold` with the expected square boolean matrix."""
    rectangles = [Rectangle(*coords_row) for coords_row in coords]

    actual = boxes_self_iou(rectangles, threshold)

    np.testing.assert_array_equal(actual, expected)
|
|
|
|
|
|
def test_remove_duplicate_elements():
    """Check that `remove_duplicate_elements` collapses regions sharing the same bbox."""
    # "Text 1" and "Text 2" occupy the identical box; "Text 3" is separate.
    shared_box_first = EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1")
    shared_box_second = EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2")
    distinct_box = EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3")
    sample_elements = TextRegions.from_list([shared_box_first, shared_box_second, distinct_box])

    deduplicated = remove_duplicate_elements(sample_elements)

    # Two regions are expected to remain; of the pair sharing a box, "Text 2" is the one kept.
    assert len(deduplicated) == 2
    assert deduplicated.texts.tolist() == ["Text 2", "Text 3"]
    assert deduplicated.element_coords.tolist() == [[0, 0, 10, 10], [20, 20, 30, 30]]
|
|
|
|
|
|
def test_process_file_with_pdfminer():
    """Run `process_file_with_pdfminer` on an example PDF and spot-check its layout and links."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    layout, links = process_file_with_pdfminer(pdf_path)

    # The layout must be non-empty.
    assert len(layout) > 0

    # The title line is expected among the texts of the first layout entry.
    first_entry_texts = layout[0].texts
    assert "LayoutParser: A Unified Toolkit for Deep\n" in first_entry_texts

    # The first link of the first links entry points at the project site.
    first_link = links[0][0]
    assert first_link["url"] == "https://layout-parser.github.io"
|
|
|
|
|
|
@patch("unstructured.partition.pdf_image.pdfminer_utils.LAParams", return_value=LAParams())
def test_laprams_are_passed_from_partition_to_pdfminer(pdfminer_mock):
    """Check which pdfminer_* options given to `partition` reach the patched `LAParams`.

    `pdfminer_char_margin` is passed as None and is asserted to be absent from the
    keyword arguments of the recorded `LAParams` call.
    """
    expected_kwargs = {
        "line_margin": 1.123,
        "line_overlap": 0.0123,
        "word_margin": 3.21,
    }

    partition(
        filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
        pdfminer_line_margin=1.123,
        pdfminer_char_margin=None,
        pdfminer_line_overlap=0.0123,
        pdfminer_word_margin=3.21,
    )

    # call_args reflects the most recent call made to the patched LAParams.
    recorded_kwargs = pdfminer_mock.call_args.kwargs
    assert recorded_kwargs == expected_kwargs
|