mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-03 06:19:13 +00:00

This PR refactors the data structure for `list[LayoutElement]` and `list[TextRegion]` used in partitioning pdf/image files. - the new data structure replaces a list of objects with one object that uses `numpy` arrays to store data - this only affects internal partition steps and doesn't change the input or output signature of the `partition` function itself, i.e., `partition` still returns `list[Element]` - internally `list[LayoutElement]` -> `LayoutElements`; `list[TextRegion]` -> `TextRegions` - the current refactor stops before cleaning up pdfminer elements inside inferred layout elements -> the cleanup algorithm needs to be refactored before the data structure refactor can move forward. So the current refactor converts the array data structure into a list data structure with an `element_array.as_list()` call. This is the last step before turning `list[LayoutElement]` into `list[Element]` as the return value - a future PR will update this last step so that we build `list[Element]` from the `LayoutElements` data structure instead. The goal of this PR is to replace the data structure as much as possible without changing the underlying logic. There are a few places where the slicing or filtering logic was simple enough to be converted into vector data structure operations. Those are refactored to be vector based. As a result there are some small improvements observed in the ingest test. This is likely because the vector operations cleaned up some previous inconsistencies in data types and operations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
94 lines
3.2 KiB
Python
94 lines
3.2 KiB
Python
from unstructured_inference.inference.elements import TextRegion, TextRegions
|
|
from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements
|
|
|
|
from unstructured.documents.elements import ElementType
|
|
from unstructured.partition.pdf_image.inference_utils import (
|
|
build_layout_elements_from_ocr_regions,
|
|
merge_text_regions,
|
|
)
|
|
|
|
|
|
def test_merge_text_regions(mock_embedded_text_regions):
    """Merge the fixture's text regions and compare against one expected region.

    The ``mock_embedded_text_regions`` fixture is wrapped in ``TextRegions``
    before being handed to ``merge_text_regions``.
    """
    title_box = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    wanted_region = TextRegion.from_coords(
        **title_box,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
    )

    regions = TextRegions.from_list(mock_embedded_text_regions)
    merged = merge_text_regions(regions)

    assert merged == wanted_region
|
|
|
|
|
|
def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
    """Build layout elements from the fixture's OCR regions with default arguments.

    The result is compared against a ``LayoutElements`` holding one
    ``UNCATEGORIZED_TEXT`` element with the expected coordinates and text.
    """
    title_box = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    title_element = LayoutElement.from_coords(
        **title_box,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    wanted_layout = LayoutElements.from_list([title_element])

    ocr_regions = TextRegions.from_list(mock_embedded_text_regions)
    built = build_layout_elements_from_ocr_regions(ocr_regions)

    assert built == wanted_layout
|
|
|
|
|
|
def test_build_layout_elements_from_ocr_regions_with_text(mock_embedded_text_regions):
    """Build layout elements from OCR regions plus a single-line OCR text.

    Called with ``group_by_ocr_text=True``; the result is compared against a
    ``LayoutElements`` holding one ``UNCATEGORIZED_TEXT`` element carrying
    the same text.
    """
    text = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image"
    title_box = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    title_element = LayoutElement.from_coords(
        **title_box,
        text=text,
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    wanted_layout = LayoutElements.from_list([title_element])

    ocr_regions = TextRegions.from_list(mock_embedded_text_regions)
    built = build_layout_elements_from_ocr_regions(
        ocr_regions,
        text,
        group_by_ocr_text=True,
    )

    assert built == wanted_layout
|
|
|
|
|
|
def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedded_text_regions):
    """Build layout elements from OCR regions plus an OCR text containing a blank line.

    Called with ``group_by_ocr_text=True``; the result is compared against a
    ``LayoutElements`` holding two ``UNCATEGORIZED_TEXT`` elements, one for
    each part of the text on either side of the ``"\\n\\n"`` separator.
    """
    text = "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image"

    ocr_regions = TextRegions.from_list(mock_embedded_text_regions)
    built = build_layout_elements_from_ocr_regions(
        ocr_regions,
        text,
        group_by_ocr_text=True,
    )

    first_part = LayoutElement.from_coords(
        x1=453.00277777777774,
        y1=317.319341111111,
        x2=711.5338541666665,
        y2=358.28571222222206,
        text="LayoutParser:",
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    second_part = LayoutElement.from_coords(
        x1=437.83888888888885,
        y1=317.319341111111,
        x2=1256.334784222222,
        y2=406.9837855555556,
        text="A Unified Toolkit for Deep Learning Based Document Image",
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    wanted_layout = LayoutElements.from_list([first_part, second_part])

    assert built == wanted_layout
|