# unstructured/test_unstructured/partition/pdf_image/test_pdfminer_processing.py

from unittest.mock import patch

import numpy as np
import pytest
from pdfminer.layout import LAParams
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
from unstructured_inference.inference.elements import (
    EmbeddedTextRegion,
    Rectangle,
    TextRegion,
    TextRegions,
)
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElements

from test_unstructured.unit_utils import example_doc_path
from unstructured.partition.auto import partition
from unstructured.partition.pdf_image.pdfminer_processing import (
    _validate_bbox,
    aggregate_embedded_text_by_block,
    bboxes1_is_almost_subregion_of_bboxes2,
    boxes_self_iou,
    clean_pdfminer_inner_elements,
    process_file_with_pdfminer,
    remove_duplicate_elements,
)
from unstructured.partition.utils.constants import Source

# Fixture: one "Table" element followed by two text elements tagged with
# source=Source.PDFMINER whose bounding boxes lie within the table's bounding box.
deletable_elements_inside_table = [
    LayoutElement(
        type="Table",
        text="Table with inner elements",
        bbox=Rectangle(0, 0, 100, 100),
    ),
    LayoutElement(
        source=Source.PDFMINER,
        text="text1",
        bbox=Rectangle(50, 50, 70, 70),
    ),
    LayoutElement(
        source=Source.PDFMINER,
        text="text2",
        bbox=Rectangle(70, 70, 80, 80),
    ),
]

# Fixture: the same geometry as a table with two inner elements, but every element
# is tagged with source=InferenceSource.YOLOX, so none has source=Source.PDFMINER.
no_deletable_elements_inside_table = [
    LayoutElement(
        source=InferenceSource.YOLOX,
        type="Table",
        text="Table with inner elements",
        bbox=Rectangle(0, 0, 100, 100),
    ),
    LayoutElement(
        source=InferenceSource.YOLOX,
        text="text1",
        bbox=Rectangle(50, 50, 70, 70),
    ),
    LayoutElement(
        source=InferenceSource.YOLOX,
        text="text2",
        bbox=Rectangle(70, 70, 80, 80),
    ),
]
# Fixture: two YOLOX-sourced tables mixed with other elements.
#   - Table1 contains one element with no explicit source and one pdfminer element.
#   - Two pdfminer elements lie outside both tables.
#   - Table2 contains two pdfminer elements.
# Note: some of the elements with source=Source.PDFMINER are not inside any table.
mix_elements_inside_table = [
    # --- table 1 and the elements within its bounding box ---
    LayoutElement(
        source=InferenceSource.YOLOX,
        type="Table",
        text="Table1 with inner elements",
        bbox=Rectangle(0, 0, 100, 100),
    ),
    LayoutElement(
        text="Inside table1",
        bbox=Rectangle(50, 50, 70, 70),
    ),
    LayoutElement(
        source=Source.PDFMINER,
        text="Inside table1",
        bbox=Rectangle(70, 70, 80, 80),
    ),
    # --- pdfminer elements outside both tables ---
    LayoutElement(
        source=Source.PDFMINER,
        text="Outside tables",
        bbox=Rectangle(150, 150, 170, 170),
    ),
    LayoutElement(
        source=Source.PDFMINER,
        text="Outside tables",
        bbox=Rectangle(180, 180, 200, 200),
    ),
    # --- table 2 and the elements within its bounding box ---
    LayoutElement(
        source=InferenceSource.YOLOX,
        type="Table",
        text="Table2 with inner elements",
        bbox=Rectangle(0, 500, 100, 700),
    ),
    LayoutElement(
        source=Source.PDFMINER,
        text="Inside table2",
        bbox=Rectangle(0, 510, 50, 600),
    ),
    LayoutElement(
        source=Source.PDFMINER,
        text="Inside table2",
        bbox=Rectangle(0, 550, 70, 650),
    ),
]


@pytest.mark.parametrize(
    ("bbox", "is_valid"),
    [
        # Four finite numeric coordinates; only the second case is expected to pass.
        ([0, 1, 0, 1], False),
        ([0, 1, 1, 2], True),
        # A coordinate that is None or NaN.
        ([0, 1, 1, None], False),
        ([0, 1, 1, np.nan], False),
        # A negative third coordinate.
        ([0, 1, -1, 0], False),
        ([0, 1, -1, 2], False),
    ],
)
def test_valid_bbox(bbox, is_valid):
    """Check that `_validate_bbox` returns exactly the expected boolean for `bbox`.

    The identity comparison (`is`) means a merely truthy/falsy return value is not
    accepted; the result must be the `True`/`False` singleton itself.
    """
    verdict = _validate_bbox(bbox)
    assert verdict is is_valid


@pytest.mark.parametrize(
    ("elements", "length_extra_info", "expected_document_length"),
    [
        (deletable_elements_inside_table, 1, 1),
        (no_deletable_elements_inside_table, 0, 3),
        (mix_elements_inside_table, 2, 5),
    ],
)
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    """Check how many elements remain on the page after `clean_pdfminer_inner_elements`.

    Args:
        elements: list of `LayoutElement` objects placed on a single-page document.
        length_extra_info: currently unused by this test; kept so the parametrization
            (and the generated test ids) stay unchanged.
        expected_document_length: number of elements expected on the page after cleaning.
    """
    # Build a single-page document whose elements are the parametrized fixture.
    page = PageLayout(number=1, image=Image.new("1", (1, 1)))
    page.elements_array = LayoutElements.from_list(elements)
    document = DocumentLayout(pages=[page])

    # Run the cleaning step under test.
    cleaned_doc = clean_pdfminer_inner_elements(document)

    # Only the number of elements left on the page is verified here.
    assert len(cleaned_doc.pages[0].elements_array) == expected_document_length


# Fixture: two pdfminer "Image" elements with identical text and overlapping
# bounding boxes, plus one unrelated "Title" element.
# NOTE(review): not referenced by any test visible in this file - confirm it is still needed.
elements_with_duplicate_images = [
    LayoutElement(
        source=Source.PDFMINER,
        type="Image",
        text="Image1",
        bbox=Rectangle(0, 0, 100, 100),
    ),
    LayoutElement(
        source=Source.PDFMINER,
        type="Image",
        text="Image1",
        bbox=Rectangle(10, 10, 110, 110),
    ),
    LayoutElement(
        type="Title",
        text="Title1",
        bbox=Rectangle(150, 150, 170, 170),
    ),
]

# Fixture: three pdfminer "Image" elements where no pair shares both its text and an
# overlapping bounding box, plus one unrelated "Title" element.
# NOTE(review): not referenced by any test visible in this file - confirm it is still needed.
elements_without_duplicate_images = [
    LayoutElement(
        source=Source.PDFMINER,
        type="Image",
        text="Sample image",
        bbox=Rectangle(0, 0, 100, 100),
    ),
    # Overlaps the first image's bbox but carries different text.
    LayoutElement(
        source=Source.PDFMINER,
        type="Image",
        text="Sample image with similar bbox",
        bbox=Rectangle(10, 10, 110, 110),
    ),
    # Same text as the first image but a disjoint bbox.
    LayoutElement(
        source=Source.PDFMINER,
        type="Image",
        text="Sample image",
        bbox=Rectangle(200, 200, 250, 250),
    ),
    LayoutElement(
        type="Title",
        text="Title1",
        bbox=Rectangle(150, 150, 170, 170),
    ),
]


def test_aggregate_by_block():
    """Check the text aggregated for a target block from a set of embedded regions.

    Four embedded regions are supplied: two with text lying within the target block,
    one with `None` text, and one extending beyond the target's bounds. The test
    expects only the first two texts, joined by a single space.
    """
    candidate_regions = [
        TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
        TextRegion.from_coords(20, 20, 80, 80, None),
        TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
        TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
    ]
    embedded = TextRegions.from_list(candidate_regions)
    block = TextRegions.from_list([TextRegion.from_coords(0, 0, 300, 300)])

    aggregated = aggregate_embedded_text_by_block(block, embedded)

    assert aggregated == "Inside region1 Inside region2"


@pytest.mark.parametrize(
    ("coords1", "coords2", "expected"),
    [
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [0, 0, 12, 12]],
            [[True, True], [False, False], [False, False]],
        ),
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [False, False, False], [False, True, False]],
        ),
        # Second box in coords1 is degenerate (a single point).
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [True, True, False]],
        ),
    ],
)
def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
    """Compare the pairwise result for two box lists against `expected`.

    `expected` has one row per box in `coords1` and one column per box in `coords2`.
    """
    first_boxes = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords1]
    second_boxes = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords2]

    actual = bboxes1_is_almost_subregion_of_bboxes2(first_boxes, second_boxes)

    np.testing.assert_array_equal(actual, expected)


@pytest.mark.parametrize(
    ("coords", "threshold", "expected"),
    [
        # Same boxes at two thresholds: the first two boxes match each other at 0.5 ...
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.5,
            [[True, True, False], [True, True, False], [False, False, True]],
        ),
        # ... but only the diagonal remains at 0.9.
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.9,
            [[True, False, False], [False, True, False], [False, False, True]],
        ),
        # Second box is degenerate (a single point).
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            0.5,
            [[True, False], [False, True]],
        ),
    ],
)
def test_boxes_self_iou(coords, threshold, expected):
    """Compare `boxes_self_iou` for a box list and threshold against `expected`.

    `expected` is a square boolean matrix with one row and one column per box.
    """
    rectangles = [Rectangle(x1, y1, x2, y2) for x1, y1, x2, y2 in coords]

    actual = boxes_self_iou(rectangles, threshold)

    np.testing.assert_array_equal(actual, expected)


def test_remove_duplicate_elements():
    """Check de-duplication of text regions that share a bounding box.

    The first two regions have identical bounding boxes; the test expects a single
    survivor for that box (the one carrying "Text 2") plus the third, distinct region.
    """
    regions = [
        EmbeddedTextRegion(text="Text 1", bbox=Rectangle(0, 0, 10, 10)),
        EmbeddedTextRegion(text="Text 2", bbox=Rectangle(0, 0, 10, 10)),
        EmbeddedTextRegion(text="Text 3", bbox=Rectangle(20, 20, 30, 30)),
    ]

    deduplicated = remove_duplicate_elements(TextRegions.from_list(regions))

    # Two regions are expected to remain, with these texts and coordinates.
    assert len(deduplicated) == 2
    assert deduplicated.texts.tolist() == ["Text 2", "Text 3"]
    assert deduplicated.element_coords.tolist() == [[0, 0, 10, 10], [20, 20, 30, 30]]


def test_process_file_with_pdfminer():
    """Run `process_file_with_pdfminer` on an example PDF and spot-check its output.

    Checks that the returned layout is non-empty, that its first entry's texts
    contain the expected title line, and that the first link of the first links
    entry points at the expected URL.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")

    layout, links = process_file_with_pdfminer(pdf_path)

    assert len(layout) > 0
    first_entry_texts = layout[0].texts
    assert "LayoutParser: A Uniﬁed Toolkit for Deep\n" in first_entry_texts
    first_link = links[0][0]
    assert first_link["url"] == "https://layout-parser.github.io"


@patch("unstructured.partition.pdf_image.pdfminer_utils.LAParams", return_value=LAParams())
def test_laprams_are_passed_from_partition_to_pdfminer(laparams_mock):
    """Check the keyword arguments with which the patched `LAParams` is last called.

    `partition` receives four `pdfminer_*` options, one of them `None`. The test
    expects the three non-`None` values to arrive under their un-prefixed names and
    the `None`-valued `char_margin` to be absent.
    """
    expected_laparams_kwargs = {
        "line_margin": 1.123,
        "line_overlap": 0.0123,
        "word_margin": 3.21,
    }

    partition(
        filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
        pdfminer_line_margin=1.123,
        pdfminer_char_margin=None,
        pdfminer_line_overlap=0.0123,
        pdfminer_word_margin=3.21,
    )

    assert laparams_mock.call_args.kwargs == expected_laparams_kwargs