unstructured/test_unstructured/partition/pdf_image/test_inference_utils.py

from unstructured_inference.inference.elements import TextRegion, TextRegions
from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements

from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image.inference_utils import (
    build_layout_elements_from_ocr_regions,
    merge_text_regions,
)


def test_merge_text_regions(mock_embedded_text_regions):
    """Check that `merge_text_regions` returns the expected single `TextRegion`.

    The fixture regions are wrapped in a `TextRegions` container, merged, and the result is
    compared against a region built from fixed coordinates and the expected text.
    """
    bbox = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    expected_text = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image"
    regions = TextRegions.from_list(mock_embedded_text_regions)

    assert merge_text_regions(regions) == TextRegion.from_coords(**bbox, text=expected_text)


def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
    """Called with only the OCR regions, the builder should return one element.

    The result is compared against a `LayoutElements` holding a single uncategorized-text
    `LayoutElement` built from fixed coordinates and text.
    """
    ocr_regions = TextRegions.from_list(mock_embedded_text_regions)
    sole_element = LayoutElement.from_coords(
        x1=437.83888888888885,
        y1=317.319341111111,
        x2=1256.334784222222,
        y2=406.9837855555556,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
        type=ElementType.UNCATEGORIZED_TEXT,
    )

    actual = build_layout_elements_from_ocr_regions(ocr_regions)

    assert actual == LayoutElements.from_list([sole_element])


def test_build_layout_elements_from_ocr_regions_with_text(mock_embedded_text_regions):
    """Grouping by a one-paragraph OCR text is expected to yield one element with that text."""
    ocr_text = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image"

    actual = build_layout_elements_from_ocr_regions(
        TextRegions.from_list(mock_embedded_text_regions),
        ocr_text,
        group_by_ocr_text=True,
    )

    # The single expected element carries the OCR text passed to the builder.
    whole_text_element = LayoutElement.from_coords(
        x1=437.83888888888885,
        y1=317.319341111111,
        x2=1256.334784222222,
        y2=406.9837855555556,
        text=ocr_text,
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    assert actual == LayoutElements.from_list([whole_text_element])


def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedded_text_regions):
    """An OCR text with two blank-line-separated parts is expected to give two elements.

    Each expected element is an uncategorized-text `LayoutElement` with its own bounding box
    and one part of the OCR text.
    """

    def uncategorized(x1, y1, x2, y2, content):
        # Shorthand for the element type every expected entry shares.
        return LayoutElement.from_coords(
            x1=x1, y1=y1, x2=x2, y2=y2, text=content, type=ElementType.UNCATEGORIZED_TEXT
        )

    ocr_text = "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image"
    actual = build_layout_elements_from_ocr_regions(
        TextRegions.from_list(mock_embedded_text_regions),
        ocr_text,
        group_by_ocr_text=True,
    )

    expected_boxes = [
        (
            453.00277777777774,
            317.319341111111,
            711.5338541666665,
            358.28571222222206,
            "LayoutParser:",
        ),
        (
            437.83888888888885,
            317.319341111111,
            1256.334784222222,
            406.9837855555556,
            "A Unified Toolkit for Deep Learning Based Document Image",
        ),
    ]
    assert actual == LayoutElements.from_list([uncategorized(*box) for box in expected_boxes])


def test_build_layout_elements_from_ocr_regions_with_repeated_texts(mock_embedded_text_regions):
    """Words that recur in the OCR text should end up in the expected groups, in order.

    Four extra regions ("LayoutParser", "for", "Deep", "Learning") are appended to the fixture
    list, and the OCR text gains a third part made of those words. The result is expected to
    contain three elements, the last one reading "LayoutParser for Deep Learning".
    """

    def uncategorized(x1, y1, x2, y2, content):
        # Shorthand for the element type shared by every region and expected element here.
        return LayoutElement.from_coords(
            x1=x1, y1=y1, x2=x2, y2=y2, text=content, type=ElementType.UNCATEGORIZED_TEXT
        )

    # All appended regions share the same horizontal extent; only y1/y2 and the word differ.
    left, right = 453.00277777777774, 711.5338541666665
    extra_words = [
        (417.319341111111, 458.28571222222206, "LayoutParser"),
        (468.319341111111, 478.28571222222206, "for"),
        (488.319341111111, 500.28571222222206, "Deep"),
        (510.319341111111, 550.28571222222206, "Learning"),
    ]
    # NOTE: this extends the list handed in by the fixture in place.
    mock_embedded_text_regions.extend(
        [uncategorized(left, top, right, bottom, word) for top, bottom, word in extra_words]
    )

    ocr_text = (
        "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\n"
        "LayoutParser for Deep Learning"
    )
    elements = build_layout_elements_from_ocr_regions(
        TextRegions.from_list(mock_embedded_text_regions),
        ocr_text,
        group_by_ocr_text=True,
    )

    expected_boxes = [
        (left, 317.319341111111, right, 358.28571222222206, "LayoutParser:"),
        (
            437.83888888888885,
            317.319341111111,
            1256.334784222222,
            406.9837855555556,
            "A Unified Toolkit for Deep Learning Based Document Image",
        ),
        (left, 417.319341111111, right, 550.28571222222206, "LayoutParser for Deep Learning"),
    ]
    assert elements == LayoutElements.from_list([uncategorized(*box) for box in expected_boxes])
bump `unstructured-inference` (#3711) This PR bumps `unstructured-inference` to `0.8.0`, which introduces vectorized data structure for layout elements and text regions. This PR also cleans up a few places in CI that has repeated definition of env variables or missing installation of testing dependencies in cache. A few document ingest results are changed: - two places for `biomed-api` (actually processed locally on runner) are due to very small changes in numerical results of the bounding box areas: one results in a duplicated page number/header and another results in a deduplication of a word of a sentence that starts in a new line. (yes, two cases goes in opposite directions) - the layout parser paper now outputs the code lines with page number inside the code box as list items --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> Co-authored-by: christinestraub <christinemstraub@gmail.com> 2024-10-21 16:55:08 -05:00			`from unstructured_inference.inference.elements import TextRegion, TextRegions`
Feat/refactor layoutelement textregion to vectorized data structure (#3881) This PR refactors the data structure for `list[LayoutElement]` and `list[TextRegion]` used in partition pdf/image files. - new data structure replaces a list of objects with one object with `numpy` array to store data - this only affects partition internal steps and it doesn't change input or output signature of `partition` function itself, i.e., `partition` still returns `list[Element]` - internally `list[LayoutElement]` -> `LayoutElements`; `list[TextRegion]` -> `TextRegions` - current refactor stops before clean up pdfminer elements inside inferred layout elements -> the algorithm of clean up needs to be refactored before the data structure refactor can move forward. So current refactor converts the array data structure into list data structure with `element_array.as_list()` call. This is the last step before turning `list[LayoutElement]` into `list[Element]` as return - a future PR will update this last step so that we build `list[Element]` from `LayoutElements` data structure instead. The goal of this PR is to replace the data structure as much as possible without changing underlying logic. There are a few places where the slicing or filtering logic was simple enough to be converted into vector data structure operations. Those are refactored to be vector based. As a result there is some small improvements observed in ingest test. This is likely because the vector operations cleaned up some previous inconsistency in data types and operations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> 2025-01-23 11:11:38 -06:00			`from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements`
refactor: ocr modules (#2492) The purpose of this PR is to refactor OCR-related modules to reduce unnecessary module imports to avoid potential issues (most likely due to a "circular import"). ### Summary - add `inference_utils` module (unstructured/partition/pdf_image/inference_utils.py) to define unstructured-inference library related utility functions, which will reduce importing unstructured-inference library functions in other files - add `conftest.py` in `test_unstructured/partition/pdf_image/` directory to define fixtures that are available to all tests in the same directory and its subdirectories ### Testing CI should pass 2024-02-06 09:11:55 -08:00
			`from unstructured.documents.elements import ElementType`
			`from unstructured.partition.pdf_image.inference_utils import (`
			`build_layout_elements_from_ocr_regions,`
			`merge_text_regions,`
			`)`


			`def test_merge_text_regions(mock_embedded_text_regions):`
			`expected = TextRegion.from_coords(`
			`x1=437.83888888888885,`
			`y1=317.319341111111,`
			`x2=1256.334784222222,`
			`y2=406.9837855555556,`
			`text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",`
			`)`

bump `unstructured-inference` (#3711) This PR bumps `unstructured-inference` to `0.8.0`, which introduces vectorized data structure for layout elements and text regions. This PR also cleans up a few places in CI that has repeated definition of env variables or missing installation of testing dependencies in cache. A few document ingest results are changed: - two places for `biomed-api` (actually processed locally on runner) are due to very small changes in numerical results of the bounding box areas: one results in a duplicated page number/header and another results in a deduplication of a word of a sentence that starts in a new line. (yes, two cases goes in opposite directions) - the layout parser paper now outputs the code lines with page number inside the code box as list items --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> Co-authored-by: christinestraub <christinemstraub@gmail.com> 2024-10-21 16:55:08 -05:00			`merged_text_region = merge_text_regions(TextRegions.from_list(mock_embedded_text_regions))`
refactor: ocr modules (#2492) The purpose of this PR is to refactor OCR-related modules to reduce unnecessary module imports to avoid potential issues (most likely due to a "circular import"). ### Summary - add `inference_utils` module (unstructured/partition/pdf_image/inference_utils.py) to define unstructured-inference library related utility functions, which will reduce importing unstructured-inference library functions in other files - add `conftest.py` in `test_unstructured/partition/pdf_image/` directory to define fixtures that are available to all tests in the same directory and its subdirectories ### Testing CI should pass 2024-02-06 09:11:55 -08:00			`assert merged_text_region == expected`


			`def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):`
Feat/refactor layoutelement textregion to vectorized data structure (#3881) This PR refactors the data structure for `list[LayoutElement]` and `list[TextRegion]` used in partition pdf/image files. - new data structure replaces a list of objects with one object with `numpy` array to store data - this only affects partition internal steps and it doesn't change input or output signature of `partition` function itself, i.e., `partition` still returns `list[Element]` - internally `list[LayoutElement]` -> `LayoutElements`; `list[TextRegion]` -> `TextRegions` - current refactor stops before clean up pdfminer elements inside inferred layout elements -> the algorithm of clean up needs to be refactored before the data structure refactor can move forward. So current refactor converts the array data structure into list data structure with `element_array.as_list()` call. This is the last step before turning `list[LayoutElement]` into `list[Element]` as return - a future PR will update this last step so that we build `list[Element]` from `LayoutElements` data structure instead. The goal of this PR is to replace the data structure as much as possible without changing underlying logic. There are a few places where the slicing or filtering logic was simple enough to be converted into vector data structure operations. Those are refactored to be vector based. As a result there is some small improvements observed in ingest test. This is likely because the vector operations cleaned up some previous inconsistency in data types and operations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> 2025-01-23 11:11:38 -06:00			`expected = LayoutElements.from_list(`
			`[`
			`LayoutElement.from_coords(`
			`x1=437.83888888888885,`
			`y1=317.319341111111,`
			`x2=1256.334784222222,`
			`y2=406.9837855555556,`
			`text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`]`
			`)`

			`elements = build_layout_elements_from_ocr_regions(`
			`TextRegions.from_list(mock_embedded_text_regions)`
			`)`
refactor: ocr modules (#2492) The purpose of this PR is to refactor OCR-related modules to reduce unnecessary module imports to avoid potential issues (most likely due to a "circular import"). ### Summary - add `inference_utils` module (unstructured/partition/pdf_image/inference_utils.py) to define unstructured-inference library related utility functions, which will reduce importing unstructured-inference library functions in other files - add `conftest.py` in `test_unstructured/partition/pdf_image/` directory to define fixtures that are available to all tests in the same directory and its subdirectories ### Testing CI should pass 2024-02-06 09:11:55 -08:00			`assert elements == expected`
Feat/refactor layoutelement textregion to vectorized data structure (#3881) This PR refactors the data structure for `list[LayoutElement]` and `list[TextRegion]` used in partition pdf/image files. - new data structure replaces a list of objects with one object with `numpy` array to store data - this only affects partition internal steps and it doesn't change input or output signature of `partition` function itself, i.e., `partition` still returns `list[Element]` - internally `list[LayoutElement]` -> `LayoutElements`; `list[TextRegion]` -> `TextRegions` - current refactor stops before clean up pdfminer elements inside inferred layout elements -> the algorithm of clean up needs to be refactored before the data structure refactor can move forward. So current refactor converts the array data structure into list data structure with `element_array.as_list()` call. This is the last step before turning `list[LayoutElement]` into `list[Element]` as return - a future PR will update this last step so that we build `list[Element]` from `LayoutElements` data structure instead. The goal of this PR is to replace the data structure as much as possible without changing underlying logic. There are a few places where the slicing or filtering logic was simple enough to be converted into vector data structure operations. Those are refactored to be vector based. As a result there is some small improvements observed in ingest test. This is likely because the vector operations cleaned up some previous inconsistency in data types and operations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> 2025-01-23 11:11:38 -06:00

			`def test_build_layout_elements_from_ocr_regions_with_text(mock_embedded_text_regions):`
			`text = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image"`
			`expected = LayoutElements.from_list(`
			`[`
			`LayoutElement.from_coords(`
			`x1=437.83888888888885,`
			`y1=317.319341111111,`
			`x2=1256.334784222222,`
			`y2=406.9837855555556,`
			`text=text,`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`]`
			`)`

			`elements = build_layout_elements_from_ocr_regions(`
			`TextRegions.from_list(mock_embedded_text_regions),`
			`text,`
			`group_by_ocr_text=True,`
			`)`
			`assert elements == expected`


			`def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedded_text_regions):`
			`text = "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image"`
			`elements = build_layout_elements_from_ocr_regions(`
			`TextRegions.from_list(mock_embedded_text_regions),`
			`text,`
			`group_by_ocr_text=True,`
			`)`
			`assert elements == LayoutElements.from_list(`
			`[`
			`LayoutElement.from_coords(`
			`x1=453.00277777777774,`
			`y1=317.319341111111,`
			`x2=711.5338541666665,`
			`y2=358.28571222222206,`
			`text="LayoutParser:",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`LayoutElement.from_coords(`
			`x1=437.83888888888885,`
			`y1=317.319341111111,`
			`x2=1256.334784222222,`
			`y2=406.9837855555556,`
			`text="A Unified Toolkit for Deep Learning Based Document Image",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`]`
			`)`
Fix/fix ocr region to elements bug (#3891) This PR fixes a bug in `build_layout_elements_from_ocr_regions` where texts are joined in the wrong order. The bug is due to incorrect masking of the `ocr_regions` after some have already been selected as one of the final groups. The fix uses a simpler method to mask the indices: it reuses the same indices that add the regions to the final groups to mask them, so they are not considered again. ## Testing This PR adds a unit test specifically aimed at this bug. Without the fix the test would fail. Additionally, any PDF file with repeated text has the potential to trigger this bug. E.g., creating a simple PDF using the test text ```python "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\nLayoutParser for Deep Learning" ``` and partitioning it with `ocr_only` mode on the main branch would hit this bug and output text where the position of the second "LayoutParser" is incorrect. ```python [ 'LayoutParser:', 'A Unified Toolkit for Deep Learning Based Document Image', 'for Deep Learning LayoutParser', ] ``` 2025-01-29 06:11:17 -06:00

			`def test_build_layout_elements_from_ocr_regions_with_repeated_texts(mock_embedded_text_regions):`
			`mock_embedded_text_regions.extend(`
			`[`
			`LayoutElement.from_coords(`
			`x1=453.00277777777774,`
			`y1=417.319341111111,`
			`x2=711.5338541666665,`
			`y2=458.28571222222206,`
			`text="LayoutParser",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`LayoutElement.from_coords(`
			`x1=453.00277777777774,`
			`y1=468.319341111111,`
			`x2=711.5338541666665,`
			`y2=478.28571222222206,`
			`text="for",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`LayoutElement.from_coords(`
			`x1=453.00277777777774,`
			`y1=488.319341111111,`
			`x2=711.5338541666665,`
			`y2=500.28571222222206,`
			`text="Deep",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`LayoutElement.from_coords(`
			`x1=453.00277777777774,`
			`y1=510.319341111111,`
			`x2=711.5338541666665,`
			`y2=550.28571222222206,`
			`text="Learning",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`]`
			`)`
			`text = (`
			`"LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\n"`
			`"LayoutParser for Deep Learning"`
			`)`
			`elements = build_layout_elements_from_ocr_regions(`
			`TextRegions.from_list(mock_embedded_text_regions),`
			`text,`
			`group_by_ocr_text=True,`
			`)`
			`assert elements == LayoutElements.from_list(`
			`[`
			`LayoutElement.from_coords(`
			`x1=453.00277777777774,`
			`y1=317.319341111111,`
			`x2=711.5338541666665,`
			`y2=358.28571222222206,`
			`text="LayoutParser:",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`LayoutElement.from_coords(`
			`x1=437.83888888888885,`
			`y1=317.319341111111,`
			`x2=1256.334784222222,`
			`y2=406.9837855555556,`
			`text="A Unified Toolkit for Deep Learning Based Document Image",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`LayoutElement.from_coords(`
			`x1=453.00277777777774,`
			`y1=417.319341111111,`
			`x2=711.5338541666665,`
			`y2=550.28571222222206,`
			`text="LayoutParser for Deep Learning",`
			`type=ElementType.UNCATEGORIZED_TEXT,`
			`),`
			`]`
			`)`