Christine Straub 29b9ea7ba6
refactor: ocr modules (#2492)
The purpose of this PR is to refactor OCR-related modules to reduce
unnecessary module imports to avoid potential issues (most likely due to
a "circular import").

### Summary
- add `inference_utils` module
(unstructured/partition/pdf_image/inference_utils.py) to define
unstructured-inference library related utility functions, which will
reduce importing unstructured-inference library functions in other files
- add `conftest.py` in `test_unstructured/partition/pdf_image/`
directory to define fixtures that are available to all tests in the same
directory and its subdirectories

### Testing
CI should pass
2024-02-06 17:11:55 +00:00

38 lines
1.3 KiB
Python

from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image.inference_utils import (
build_layout_elements_from_ocr_regions,
merge_text_regions,
)
def test_merge_text_regions(mock_embedded_text_regions):
expected = TextRegion.from_coords(
x1=437.83888888888885,
y1=317.319341111111,
x2=1256.334784222222,
y2=406.9837855555556,
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
)
merged_text_region = merge_text_regions(mock_embedded_text_regions)
assert merged_text_region == expected
def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
expected = [
LayoutElement.from_coords(
x1=437.83888888888885,
y1=317.319341111111,
x2=1256.334784222222,
y2=406.9837855555556,
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
type=ElementType.UNCATEGORIZED_TEXT,
),
]
elements = build_layout_elements_from_ocr_regions(mock_embedded_text_regions)
assert elements == expected