unstructured/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Yao You 2dceac34b5
Feat/remove reference of PageLayout.elements (#3943)
This PR removes usage of `PageLayout.elements` from partition function,
except for when `analysis=True`. This PR updates the partition logic so
that `PageLayout.elements_array` is used everywhere to save memory and
cpu cost.
Since the analysis function is intended for investigation and not for
general document processing purposes, this part of the code is left for
a future refactor.

`PageLayout.elements` uses a list to store layout elements' data while
`elements_array` uses a `numpy` array to store the data, which has much
lower memory requirements. When measured with `memory_profiler`, the
difference in memory usage is usually around 10x.
2025-03-12 15:21:21 +00:00

266 lines
8.9 KiB
Python

from unittest.mock import patch
import numpy as np
import pytest
from pdfminer.layout import LAParams
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
Rectangle,
TextRegion,
TextRegions,
)
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElements
from test_unstructured.unit_utils import example_doc_path
from unstructured.partition.auto import partition
from unstructured.partition.pdf_image.pdfminer_processing import (
_validate_bbox,
aggregate_embedded_text_by_block,
bboxes1_is_almost_subregion_of_bboxes2,
boxes_self_iou,
clean_pdfminer_inner_elements,
process_file_with_pdfminer,
remove_duplicate_elements,
)
from unstructured.partition.utils.constants import Source
# Fixture: one "Table" element (no explicit source) whose bbox encloses two
# text elements that both carry source=Source.PDFMINER.
deletable_elements_inside_table = [
    LayoutElement(bbox=Rectangle(0, 0, 100, 100), text="Table with inner elements", type="Table"),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=Source.PDFMINER,
    ),
]
# Fixture: the same geometry as above, but every element (the table and the two
# boxes inside it) is sourced from InferenceSource.YOLOX, so none of them has
# source=Source.PDFMINER.
no_deletable_elements_inside_table = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=InferenceSource.YOLOX,
    ),
]
# Fixture: two YOLOX tables plus a mix of other elements.
# - Table1 encloses one element without an explicit source and one PDFMINER element.
# - Two PDFMINER elements lie outside both tables.
# - Table2 encloses two PDFMINER elements.
# Note: some elements with source=Source.PDFMINER are not inside any table.
mix_elements_inside_table = [
    # --- table 1 and the elements inside it ---
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table1 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="Inside table1",
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="Inside table1",
        source=Source.PDFMINER,
    ),
    # --- PDFMINER elements outside both tables ---
    LayoutElement(bbox=Rectangle(150, 150, 170, 170), text="Outside tables", source=Source.PDFMINER),
    LayoutElement(bbox=Rectangle(180, 180, 200, 200), text="Outside tables", source=Source.PDFMINER),
    # --- table 2 and the elements inside it ---
    LayoutElement(
        bbox=Rectangle(0, 500, 100, 700),
        text="Table2 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(0, 510, 50, 600),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(0, 550, 70, 650),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
]
@pytest.mark.parametrize(
    ("bbox", "is_valid"),
    [
        # first coordinate pair equals the second pair
        ([0, 1, 0, 1], False),
        # the only case expected to be accepted
        ([0, 1, 1, 2], True),
        # a None coordinate
        ([0, 1, 1, None], False),
        # a NaN coordinate
        ([0, 1, 1, np.nan], False),
        # a negative coordinate
        ([0, 1, -1, 0], False),
        ([0, 1, -1, 2], False),
    ],
)
def test_valid_bbox(bbox, is_valid):
    """`_validate_bbox` must return exactly the expected boolean for each bbox."""
    verdict = _validate_bbox(bbox)
    # Identity comparison: the result has to be the bool object itself, not merely truthy/falsy.
    assert verdict is is_valid
@pytest.mark.parametrize(
    ("elements", "length_extra_info", "expected_document_length"),
    [
        (deletable_elements_inside_table, 1, 1),
        (no_deletable_elements_inside_table, 0, 3),
        (mix_elements_inside_table, 2, 5),
    ],
)
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    """Check how many elements remain on the page after `clean_pdfminer_inner_elements`.

    `length_extra_info` is supplied by the parametrization but is not asserted on in this
    test; only the size of the cleaned page's `elements_array` is verified.
    """
    # Build a single-page document whose layout is the parametrized element list.
    single_page = PageLayout(number=1, image=Image.new("1", (1, 1)))
    single_page.elements_array = LayoutElements.from_list(elements)
    document = DocumentLayout(pages=[single_page])

    # Run the cleaning step under test.
    cleaned_doc = clean_pdfminer_inner_elements(document)

    # The number of elements left on the first page must match the expectation.
    remaining_elements = cleaned_doc.pages[0].elements_array
    assert len(remaining_elements) == expected_document_length
# Fixture: two PDFMINER "Image" elements with identical text and overlapping
# bboxes, followed by an unrelated "Title" element.
# NOTE(review): not referenced by any test visible in this section of the file.
elements_with_duplicate_images = [
    LayoutElement(bbox=Rectangle(0, 0, 100, 100), text="Image1", type="Image", source=Source.PDFMINER),
    LayoutElement(
        bbox=Rectangle(10, 10, 110, 110),
        text="Image1",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
# Fixture: three PDFMINER "Image" elements plus a "Title".
# - The first two overlap in bbox but have different text.
# - The third shares its text with the first but sits at a distant bbox.
# NOTE(review): not referenced by any test visible in this section of the file.
elements_without_duplicate_images = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100), text="Sample image", type="Image", source=Source.PDFMINER
    ),
    LayoutElement(
        bbox=Rectangle(10, 10, 110, 110),
        text="Sample image with similar bbox",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(200, 200, 250, 250), text="Sample image", type="Image", source=Source.PDFMINER
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
def test_aggregate_by_block():
    """`aggregate_embedded_text_by_block` joins the text of the matching embedded regions.

    Of the four embedded regions, the one with `None` text and the one labelled
    "Outside region" are expected to contribute nothing to the aggregated string.
    """
    # The single block that the embedded text is aggregated into.
    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

    candidate_regions = [
        TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
        TextRegion.from_coords(20, 20, 80, 80, None),
        TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
        TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
    ]
    embedded_regions = TextRegions.from_list(candidate_regions)

    aggregated = aggregate_embedded_text_by_block(target_region, embedded_regions)

    assert aggregated == "Inside region1 Inside region2"
@pytest.mark.parametrize(
    ("coords1", "coords2", "expected"),
    [
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [0, 0, 12, 12]],
            [[True, True], [False, False], [False, False]],
        ),
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [False, False, False], [False, True, False]],
        ),
        # Includes a zero-area box, [10, 10, 10, 10], in the first set.
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [True, True, False]],
        ),
    ],
)
def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
    """Compare the pairwise result matrix against `expected`.

    `expected[i][j]` is the value required for box `i` of `coords1` versus box `j`
    of `coords2`.
    """
    first_boxes = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords1]
    second_boxes = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords2]

    actual = bboxes1_is_almost_subregion_of_bboxes2(first_boxes, second_boxes)

    np.testing.assert_array_equal(actual, expected)
@pytest.mark.parametrize(
    ("coords", "threshold", "expected"),
    [
        # Same boxes at two thresholds: the first two match each other at 0.5 but not at 0.9.
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.5,
            [[True, True, False], [True, True, False], [False, False, True]],
        ),
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.9,
            [[True, False, False], [False, True, False], [False, False, True]],
        ),
        # A zero-area box, [10, 10, 10, 10], is expected to match only itself.
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            0.5,
            [[True, False], [False, True]],
        ),
    ],
)
def test_boxes_self_iou(coords, threshold, expected):
    """`boxes_self_iou(boxes, threshold)` must equal the expected boolean matrix."""
    boxes = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords]

    actual = boxes_self_iou(boxes, threshold)

    np.testing.assert_array_equal(actual, expected)
def test_remove_duplicate_elements():
    """Two regions sharing a bbox collapse into one; the distinct region is kept.

    Of the two regions at (0, 0, 10, 10), the asserted survivor is the later one ("Text 2").
    """
    regions = [
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
        EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
    ]
    sample_elements = TextRegions.from_list(regions)

    deduplicated = remove_duplicate_elements(sample_elements)

    # Only two unique elements may remain, with their texts and coordinates intact.
    assert len(deduplicated) == 2
    surviving_texts = deduplicated.texts.tolist()
    assert surviving_texts == ["Text 2", "Text 3"]
    surviving_coords = deduplicated.element_coords.tolist()
    assert surviving_coords == [[0, 0, 10, 10], [20, 20, 30, 30]]
def test_process_file_with_pdfminer():
    """Smoke-test `process_file_with_pdfminer` on the layout-parser example PDF.

    Checks that a non-empty layout is returned, that the first layout entry contains
    the expected title line, and that the first extracted link has the expected URL.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    layout, links = process_file_with_pdfminer(pdf_path)

    assert len(layout) > 0
    first_layout_texts = layout[0].texts
    assert "LayoutParser: A Unified Toolkit for Deep\n" in first_layout_texts
    first_link = links[0][0]
    assert first_link["url"] == "https://layout-parser.github.io"
@patch("unstructured.partition.pdf_image.pdfminer_utils.LAParams", return_value=LAParams())
def test_laprams_are_passed_from_partition_to_pdfminer(pdfminer_mock):
    """The `pdfminer_*` arguments of `partition` must reach the patched `LAParams`.

    The expected keyword arguments carry the names without the `pdfminer_` prefix, and
    `pdfminer_char_margin=None` is expected to be left out rather than forwarded.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    expected_kwargs = {
        "line_margin": 1.123,
        "line_overlap": 0.0123,
        "word_margin": 3.21,
    }

    partition(
        filename=pdf_path,
        pdfminer_line_margin=1.123,
        pdfminer_char_margin=None,
        pdfminer_line_overlap=0.0123,
        pdfminer_word_margin=3.21,
    )

    # `call_args` holds the arguments of the most recent call made to the mock.
    forwarded_kwargs = pdfminer_mock.call_args.kwargs
    assert forwarded_kwargs == expected_kwargs