mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-06 15:53:31 +00:00

This pull request adds the ability to configure multiple pdfminer parameters (and makes it simple to extend the set with additional parameters). One of the parameters overrides the default from the `LAParams` config class.

Example:

```python3
partition(
    filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
    pdfminer_line_margin=1.123,
    pdfminer_char_margin=None,
    pdfminer_line_overlap=0.0123,
    pdfminer_word_margin=3.21,
)
assert pdfminer_mock.call_args.kwargs == {
    "line_margin": 1.123,
    "line_overlap": 0.0123,
    "word_margin": 3.21,
}
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: plutasnyy <plutasnyy@users.noreply.github.com>
265 lines
8.8 KiB
Python
265 lines
8.8 KiB
Python
from unittest.mock import patch
|
|
|
|
import numpy as np
|
|
import pytest
|
|
from pdfminer.layout import LAParams
|
|
from PIL import Image
|
|
from unstructured_inference.constants import Source as InferenceSource
|
|
from unstructured_inference.inference.elements import (
|
|
EmbeddedTextRegion,
|
|
Rectangle,
|
|
TextRegion,
|
|
TextRegions,
|
|
)
|
|
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
|
|
|
|
from test_unstructured.unit_utils import example_doc_path
|
|
from unstructured.partition.auto import partition
|
|
from unstructured.partition.pdf_image.pdfminer_processing import (
|
|
_validate_bbox,
|
|
aggregate_embedded_text_by_block,
|
|
bboxes1_is_almost_subregion_of_bboxes2,
|
|
boxes_self_iou,
|
|
clean_pdfminer_inner_elements,
|
|
process_file_with_pdfminer,
|
|
remove_duplicate_elements,
|
|
)
|
|
from unstructured.partition.utils.constants import Source
|
|
|
|
# Fixture: one "Table" element whose bounding box encloses two text
# elements tagged with source=Source.PDFMINER.
deletable_elements_inside_table = [
    LayoutElement(bbox=Rectangle(0, 0, 100, 100), text="Table with inner elements", type="Table"),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=Source.PDFMINER,
    ),
]
|
|
|
|
# Fixture: a "Table" element plus two elements located inside its bounding
# box, all tagged with source=InferenceSource.YOLOX (none of them has
# source=Source.PDFMINER).
no_deletable_elements_inside_table = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=InferenceSource.YOLOX,
    ),
]
|
|
# Fixture: two "Table" elements mixed with elements located inside and
# outside of them.
# Note: some of the elements with source=Source.PDFMINER are not inside any
# table.
mix_elements_inside_table = [
    # --- first table and the elements within its bounding box ---
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table1 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    # no explicit source on this inner element
    LayoutElement(bbox=Rectangle(50, 50, 70, 70), text="Inside table1"),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="Inside table1",
        source=Source.PDFMINER,
    ),
    # --- pdfminer elements that lie outside both tables ---
    LayoutElement(bbox=Rectangle(150, 150, 170, 170), text="Outside tables", source=Source.PDFMINER),
    LayoutElement(bbox=Rectangle(180, 180, 200, 200), text="Outside tables", source=Source.PDFMINER),
    # --- second table and the elements within its bounding box ---
    LayoutElement(
        bbox=Rectangle(0, 500, 100, 700),
        text="Table2 with inner elements",
        type="Table",
        source=InferenceSource.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(0, 510, 50, 600),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(0, 550, 70, 650),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
]
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("bbox", "is_valid"),
    [
        ([0, 1, 0, 1], False),
        ([0, 1, 1, 2], True),
        # a missing coordinate is rejected
        ([0, 1, 1, None], False),
        # a NaN coordinate is rejected
        ([0, 1, 1, np.nan], False),
        ([0, 1, -1, 0], False),
        ([0, 1, -1, 2], False),
    ],
)
def test_valid_bbox(bbox, is_valid):
    """Check that `_validate_bbox` returns exactly the expected boolean for each bbox."""
    outcome = _validate_bbox(bbox)
    # identity comparison: the helper must return a real bool, not just a truthy value
    assert outcome is is_valid
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("elements", "length_extra_info", "expected_document_length"),
    [
        (deletable_elements_inside_table, 1, 1),
        (no_deletable_elements_inside_table, 0, 3),
        (mix_elements_inside_table, 2, 5),
    ],
)
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    """Check how many elements remain on the page after `clean_pdfminer_inner_elements`.

    Args:
        elements: layout elements placed on the single page of the document.
        length_extra_info: not referenced by this test body; kept so the
            parametrization (and the generated test IDs) stays unchanged.
        expected_document_length: number of elements expected on the page
            once the document has been cleaned.
    """
    # build a one-page sample document holding the parametrized elements
    page = PageLayout(number=1, image=Image.new("1", (1, 1)))
    page.elements = elements
    document = DocumentLayout(pages=[page])

    # call the function to clean the pdfminer inner elements
    cleaned_doc = clean_pdfminer_inner_elements(document)

    # check that only the expected number of elements is left on the page
    assert len(cleaned_doc.pages[0].elements) == expected_document_length
|
|
|
|
|
|
# Fixture: two "Image" elements sharing the same text with bounding boxes
# offset by 10 units from each other, followed by an unrelated "Title" element.
elements_with_duplicate_images = [
    LayoutElement(bbox=Rectangle(0, 0, 100, 100), text="Image1", type="Image", source=Source.PDFMINER),
    LayoutElement(
        bbox=Rectangle(10, 10, 110, 110),
        text="Image1",
        type="Image",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
|
|
|
|
# Fixture: three "Image" elements where no two share both the same text and
# a similar bounding box, followed by an unrelated "Title" element.
elements_without_duplicate_images = [
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100), text="Sample image", type="Image", source=Source.PDFMINER
    ),
    # similar bbox to the first image, but a different text
    LayoutElement(
        bbox=Rectangle(10, 10, 110, 110),
        text="Sample image with similar bbox",
        type="Image",
        source=Source.PDFMINER,
    ),
    # same text as the first image, but a distant bbox
    LayoutElement(
        bbox=Rectangle(200, 200, 250, 250), text="Sample image", type="Image", source=Source.PDFMINER
    ),
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Title1",
        type="Title",
    ),
]
|
|
|
|
|
|
def test_aggregate_by_block():
    """Check the text `aggregate_embedded_text_by_block` builds for one target region.

    Four embedded regions are supplied (one of them with `None` as text); the
    aggregated text is expected to be "Inside region1 Inside region2".
    """
    candidates = [
        TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
        TextRegion.from_coords(20, 20, 80, 80, None),
        TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
        TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
    ]
    embedded_regions = TextRegions.from_list(candidates)
    target_region = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

    aggregated = aggregate_embedded_text_by_block(target_region, embedded_regions)

    assert aggregated == "Inside region1 Inside region2"
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("coords1", "coords2", "expected"),
    [
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [0, 0, 12, 12]],
            [[True, True], [False, False], [False, False]],
        ),
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [False, False, False], [False, True, False]],
        ),
        # second box of coords1 is degenerate (zero width and height)
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [True, True, False]],
        ),
    ],
)
def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
    """Compare the result of `bboxes1_is_almost_subregion_of_bboxes2` with `expected`.

    `expected` has one row per box in `coords1` and one column per box in
    `coords2`.
    """
    first_boxes = [Rectangle(*coords) for coords in coords1]
    second_boxes = [Rectangle(*coords) for coords in coords2]

    actual = bboxes1_is_almost_subregion_of_bboxes2(first_boxes, second_boxes)

    np.testing.assert_array_equal(actual, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("coords", "threshold", "expected"),
    [
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.5,
            [[True, True, False], [True, True, False], [False, False, True]],
        ),
        # same boxes, stricter threshold: only the diagonal stays True
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.9,
            [[True, False, False], [False, True, False], [False, False, True]],
        ),
        # second box is degenerate (zero width and height)
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            0.5,
            [[True, False], [False, True]],
        ),
    ],
)
def test_boxes_self_iou(coords, threshold, expected):
    """Compare the matrix returned by `boxes_self_iou` with `expected` for a given threshold."""
    rectangles = [Rectangle(*coords_row) for coords_row in coords]

    actual = boxes_self_iou(rectangles, threshold)

    np.testing.assert_array_equal(actual, expected)
|
|
|
|
|
|
def test_remove_duplicate_elements():
    """Check that `remove_duplicate_elements` collapses regions sharing the same bbox.

    "Text 1" and "Text 2" have identical bounding boxes; the expected result
    keeps "Text 2" and the unrelated "Text 3".
    """
    regions = [
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 1"),
        EmbeddedTextRegion(bbox=Rectangle(0, 0, 10, 10), text="Text 2"),
        EmbeddedTextRegion(bbox=Rectangle(20, 20, 30, 30), text="Text 3"),
    ]
    sample_elements = TextRegions.from_list(regions)

    deduplicated = remove_duplicate_elements(sample_elements)

    # exactly two elements survive, with the texts and coordinates below
    assert len(deduplicated) == 2
    assert deduplicated.texts.tolist() == ["Text 2", "Text 3"]
    assert deduplicated.element_coords.tolist() == [[0, 0, 10, 10], [20, 20, 30, 30]]
|
|
|
|
|
|
def test_process_file_with_pdfminer():
    """Run `process_file_with_pdfminer` on a sample PDF and spot-check its output.

    Verifies that a non-empty layout is returned, that the first layout entry
    contains the document title line, and that the first extracted link
    points to the expected URL.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    layout, links = process_file_with_pdfminer(pdf_path)

    assert len(layout)
    first_entry_texts = layout[0].texts
    assert "LayoutParser: A Unified Toolkit for Deep\n" in first_entry_texts
    first_link = links[0][0]
    assert first_link["url"] == "https://layout-parser.github.io"
|
|
|
|
|
|
@patch("unstructured.partition.pdf_image.pdfminer_utils.LAParams", return_value=LAParams())
def test_laprams_are_passed_from_partition_to_pdfminer(pdfminer_mock):
    """Check that `pdfminer_*` arguments of `partition` reach the patched `LAParams`.

    The mock is expected to be called with the prefix-less parameter names;
    `pdfminer_char_margin=None` is expected to be absent from the call.
    """
    pdfminer_options = {
        "pdfminer_line_margin": 1.123,
        "pdfminer_char_margin": None,
        "pdfminer_line_overlap": 0.0123,
        "pdfminer_word_margin": 3.21,
    }

    partition(
        filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
        **pdfminer_options,
    )

    expected_kwargs = {
        "line_margin": 1.123,
        "line_overlap": 0.0123,
        "word_margin": 3.21,
    }
    # call_args holds the arguments of the most recent call to the mock
    assert pdfminer_mock.call_args.kwargs == expected_kwargs
|