2024-10-21 16:55:08 -05:00
|
|
|
from unstructured_inference.inference.elements import TextRegion, TextRegions
|
2024-02-06 09:11:55 -08:00
|
|
|
from unstructured_inference.inference.layoutelement import LayoutElement
|
|
|
|
|
|
|
|
from unstructured.documents.elements import ElementType
|
|
|
|
from unstructured.partition.pdf_image.inference_utils import (
|
|
|
|
build_layout_elements_from_ocr_regions,
|
|
|
|
merge_text_regions,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def test_merge_text_regions(mock_embedded_text_regions):
    """Check that merging the fixture's text regions yields the expected single region.

    The fixture value is wrapped with ``TextRegions.from_list`` before being handed to
    ``merge_text_regions``; the result must compare equal to a ``TextRegion`` built from
    the hard-coded coordinates and text below.
    """
    # Bounding box the merged region is expected to have.
    bounding_box = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    expected_region = TextRegion.from_coords(
        **bounding_box,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
    )

    regions = TextRegions.from_list(mock_embedded_text_regions)
    actual_region = merge_text_regions(regions)

    assert actual_region == expected_region
|
|
|
|
|
|
|
|
|
|
|
|
def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
    """Check that the fixture's regions are turned into one uncategorized-text element.

    ``build_layout_elements_from_ocr_regions`` is called with the fixture value as-is and
    its result must compare equal to a one-item list holding the ``LayoutElement`` below.
    """
    # The single layout element the builder is expected to produce.
    sole_expected_element = LayoutElement.from_coords(
        x1=437.83888888888885,
        y1=317.319341111111,
        x2=1256.334784222222,
        y2=406.9837855555556,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
        type=ElementType.UNCATEGORIZED_TEXT,
    )

    built_elements = build_layout_elements_from_ocr_regions(mock_embedded_text_regions)

    assert built_elements == [sole_expected_element]
|