unstructured/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Pluto 3973a30b8c
Feat: Add pdfminer parameters configuration (#3918)
This pull request adds the ability to configure several pdfminer
parameters, and makes it simple to extend the set with additional
parameters. One of the parameters overrides the default value from the
`LAParams` config class.

Example:
```python3
partition(
    filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
    pdfminer_line_margin=1.123,
    pdfminer_char_margin=None,
    pdfminer_line_overlap=0.0123,
    pdfminer_word_margin=3.21,
)
assert pdfminer_mock.call_args.kwargs == {
    "line_margin": 1.123,
    "line_overlap": 0.0123,
    "word_margin": 3.21,
}
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: plutasnyy <plutasnyy@users.noreply.github.com>
2025-02-17 11:41:20 +00:00

265 lines
8.8 KiB
Python

from unittest.mock import patch
import numpy as np
import pytest
from pdfminer.layout import LAParams
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
Rectangle,
TextRegion,
TextRegions,
)
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
from test_unstructured.unit_utils import example_doc_path
from unstructured.partition.auto import partition
from unstructured.partition.pdf_image.pdfminer_processing import (
_validate_bbox,
aggregate_embedded_text_by_block,
bboxes1_is_almost_subregion_of_bboxes2,
boxes_self_iou,
clean_pdfminer_inner_elements,
process_file_with_pdfminer,
remove_duplicate_elements,
)
from unstructured.partition.utils.constants import Source
# Fixture: one table element whose bbox contains two text elements that both
# carry source=Source.PDFMINER.
deletable_elements_inside_table = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=Source.PDFMINER,
    ),
]
# Fixture: the same geometry as a table with two inner elements, but every
# element is tagged source=InferenceSource.YOLOX, so none of them has
# source=Source.PDFMINER.
no_deletable_elements_inside_table = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=InferenceSource.YOLOX,
    ),
]
# Fixture: two YOLOX tables mixed with pdfminer-sourced elements.
# Note: some of the elements with source=Source.PDFMINER lie outside both
# tables, and one element inside table1 has no explicit source at all.
mix_elements_inside_table = [
    # --- table1 and the elements within its bbox ---
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table1 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="Inside table1",
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="Inside table1",
        source=Source.PDFMINER,
    ),
    # --- pdfminer elements that are not within either table bbox ---
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Outside tables",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(180, 180, 200, 200),
        text="Outside tables",
        source=Source.PDFMINER,
    ),
    # --- table2 and the elements within its bbox ---
    LayoutElement(
        bbox=Rectangle(0, 500, 100, 700),
        text="Table2 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(0, 510, 50, 600),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(0, 550, 70, 650),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
]
@pytest.mark.parametrize(
    ("bbox", "is_valid"),
    [
        ([0, 1, 0, 1], False),
        ([0, 1, 1, 2], True),
        ([0, 1, 1, None], False),
        ([0, 1, 1, np.nan], False),
        ([0, 1, -1, 0], False),
        ([0, 1, -1, 2], False),
    ],
)
def test_valid_bbox(bbox, is_valid):
    """`_validate_bbox` must return exactly the expected boolean for each bbox."""
    verdict = _validate_bbox(bbox)
    # Identity comparison: the result has to be the bool itself, not merely truthy/falsy.
    assert verdict is is_valid
@pytest.mark.parametrize(
    ("elements", "length_extra_info", "expected_document_length"),
    [
        (deletable_elements_inside_table, 1, 1),
        (no_deletable_elements_inside_table, 0, 3),
        (mix_elements_inside_table, 2, 5),
    ],
)
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    """After cleaning, the page holds `expected_document_length` elements.

    `length_extra_info` is supplied by the parametrization but is not read by
    this test body.
    """
    # Wrap the parametrized elements in a single-page document.
    table_page = PageLayout(number=1, image=Image.new("1", (1, 1)))
    table_page.elements = elements

    cleaned = clean_pdfminer_inner_elements(DocumentLayout(pages=[table_page]))

    # Only the number of elements left on the first page is verified.
    remaining = cleaned.pages[0].elements
    assert len(remaining) == expected_document_length
# Fixture: two pdfminer "Image" elements with identical text and bboxes offset
# by 10 units from each other, followed by a "Title" element.
elements_with_duplicate_images = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Image1",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(10, 10, 110, 110),
        text="Image1",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
# Fixture: three pdfminer "Image" elements plus a "Title" element. The first
# two images have nearby bboxes but different text; the first and third share
# the same text but have distant bboxes.
elements_without_duplicate_images = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Sample image",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(10, 10, 110, 110),
        text="Sample image with similar bbox",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(200, 200, 250, 250),
        text="Sample image",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
def test_aggregate_by_block():
    """Aggregating over the target block yields "Inside region1 Inside region2".

    The candidate regions also include one with `None` text and one that only
    partly overlaps the target; neither shows up in the expected text.
    """
    block = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])
    candidates = TextRegions.from_list(
        [
            TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
            TextRegion.from_coords(20, 20, 80, 80, None),
            TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
            TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
        ]
    )

    aggregated = aggregate_embedded_text_by_block(block, candidates)

    assert aggregated == "Inside region1 Inside region2"
@pytest.mark.parametrize(
    ("coords1", "coords2", "expected"),
    [
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [0, 0, 12, 12]],
            [[True, True], [False, False], [False, False]],
        ),
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [False, False, False], [False, True, False]],
        ),
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [True, True, False]],
        ),
    ],
)
def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
    """The result must equal `expected`, one row per box in `coords1` and one
    column per box in `coords2`.
    """
    # Every coordinate row has four entries, so unpack them by name.
    inner_boxes = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords1]
    outer_boxes = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords2]

    subregion_mask = bboxes1_is_almost_subregion_of_bboxes2(inner_boxes, outer_boxes)

    np.testing.assert_array_equal(subregion_mask, expected)
@pytest.mark.parametrize(
    ("coords", "threshold", "expected"),
    [
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.5,
            [[True, True, False], [True, True, False], [False, False, True]],
        ),
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.9,
            [[True, False, False], [False, True, False], [False, False, True]],
        ),
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            0.5,
            [[True, False], [False, True]],
        ),
    ],
)
def test_boxes_self_iou(coords, threshold, expected):
    """`boxes_self_iou` at the given threshold must equal the `expected` square matrix."""
    # Every coordinate row has four entries, so unpack them by name.
    rectangles = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords]

    iou_mask = boxes_self_iou(rectangles, threshold)

    np.testing.assert_array_equal(iou_mask, expected)
def test_remove_duplicate_elements():
    """Two regions with the same bbox collapse to one; "Text 2" is the one expected to remain."""
    shared_box = (0, 0, 10, 10)
    regions = [
        EmbeddedTextRegion(bbox=Rectangle(*shared_box), text="Text 1"),
        EmbeddedTextRegion(bbox=Rectangle(*shared_box), text="Text 2"),
        EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
    ]

    deduplicated = remove_duplicate_elements(TextRegions.from_list(regions))

    # Three inputs, one duplicate pair -> two survivors.
    assert len(deduplicated) == 2
    assert deduplicated.texts.tolist() == ["Text 2", "Text 3"]
    assert deduplicated.element_coords.tolist() == [[0, 0, 10, 10], [20, 20, 30, 30]]
def test_process_file_with_pdfminer():
    """Processing the sample PDF yields a non-empty layout, the paper title text
    on the first page, and the project URL as the first link.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    layout, links = process_file_with_pdfminer(pdf_path)

    assert len(layout) > 0
    first_page_texts = layout[0].texts
    assert "LayoutParser: A Unified Toolkit for Deep\n" in first_page_texts
    first_link = links[0][0]
    assert first_link["url"] == "https://layout-parser.github.io"
@patch("unstructured.partition.pdf_image.pdfminer_utils.LAParams", return_value=LAParams())
def test_laprams_are_passed_from_partition_to_pdfminer(pdfminer_mock):
    """The `pdfminer_*` kwargs given to `partition` reach the patched `LAParams`.

    `pdfminer_char_margin` is passed as `None`, and the expected keyword
    arguments contain no `char_margin` entry.
    """
    expected_kwargs = {
        "line_margin": 1.123,
        "line_overlap": 0.0123,
        "word_margin": 3.21,
    }

    partition(
        filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
        pdfminer_line_margin=1.123,
        pdfminer_char_margin=None,
        pdfminer_line_overlap=0.0123,
        pdfminer_word_margin=3.21,
    )

    # call_args holds the most recent call made to the mocked LAParams.
    assert pdfminer_mock.call_args.kwargs == expected_kwargs