mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-01 21:36:57 +00:00

This PR bumps `unstructured-inference` to `0.8.0`, which introduces vectorized data structures for layout elements and text regions. This PR also cleans up a few places in CI that have repeated definitions of env variables or are missing the installation of testing dependencies in the cache. A few document ingest results are changed: - two places for `biomed-api` (actually processed locally on the runner) are due to very small changes in the numerical results of the bounding box areas: one results in a duplicated page number/header and another results in the deduplication of a word in a sentence that starts on a new line (yes, the two cases go in opposite directions) - the layout parser paper now outputs the code lines with the page number inside the code box as list items --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> Co-authored-by: christinestraub <christinemstraub@gmail.com>
38 lines
1.3 KiB
Python
38 lines
1.3 KiB
Python
from unstructured_inference.inference.elements import TextRegion, TextRegions
|
|
from unstructured_inference.inference.layoutelement import LayoutElement
|
|
|
|
from unstructured.documents.elements import ElementType
|
|
from unstructured.partition.pdf_image.inference_utils import (
|
|
build_layout_elements_from_ocr_regions,
|
|
merge_text_regions,
|
|
)
|
|
|
|
|
|
def test_merge_text_regions(mock_embedded_text_regions):
    """Check that `merge_text_regions` returns the expected single `TextRegion`.

    The `mock_embedded_text_regions` fixture is wrapped in a `TextRegions`
    collection before being merged; the result is compared against a region
    built from hard-coded coordinates and text.
    """
    bbox = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image"
    want = TextRegion.from_coords(text=title, **bbox)

    regions = TextRegions.from_list(mock_embedded_text_regions)
    got = merge_text_regions(regions)

    assert got == want
|
|
|
|
|
|
def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
    """Check the layout elements built from the mocked OCR regions.

    The `mock_embedded_text_regions` fixture is passed directly to
    `build_layout_elements_from_ocr_regions`; the result must equal a
    one-item list holding an uncategorized-text `LayoutElement` with the
    hard-coded coordinates and text below.
    """
    title_element = LayoutElement.from_coords(
        x1=437.83888888888885,
        y1=317.319341111111,
        x2=1256.334784222222,
        y2=406.9837855555556,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    want = [title_element]

    got = build_layout_elements_from_ocr_regions(mock_embedded_text_regions)

    assert got == want
|