mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 23:20:35 +00:00

The default sorting algorithm for PDFs, "xycut", would cause an error when partitioning a document if Y coordinates were negative. This change checks for that condition (or, more broadly, any negative coordinates) and falls back to the "basic" sort if that is the case. This PR does not address the underlying issue of "bad points", which should still be investigated. However, the sorting code should be less brittle to unexpected bounding boxes in the first place. Resolves: https://github.com/Unstructured-IO/unstructured/issues/1296
101 lines
3.3 KiB
Python
101 lines
3.3 KiB
Python
import pytest
|
|
|
|
from unstructured.documents.coordinates import PixelSpace
|
|
from unstructured.documents.elements import CoordinatesMetadata, Element, Text
|
|
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
|
|
from unstructured.partition.utils.sorting import (
|
|
coord_has_valid_points,
|
|
sort_page_elements,
|
|
)
|
|
|
|
|
|
def test_coord_valid_coordinates():
    """Four two-value points with non-negative numbers are reported as valid."""
    points = [(1, 2), (3, 4), (5, 6), (7, 8)]
    metadata = CoordinatesMetadata(points, PixelSpace)
    is_valid = coord_has_valid_points(metadata)
    assert is_valid is True
|
|
|
|
|
|
def test_coord_missing_incomplete_point():
    """A coordinate set holding only three points is reported as invalid."""
    three_points = [(1, 2), (3, 4), (5, 6)]
    metadata = CoordinatesMetadata(three_points, PixelSpace)
    is_valid = coord_has_valid_points(metadata)
    assert is_valid is False
|
|
|
|
|
|
def test_coord_negative_values():
    """A single negative value among the four points makes the set invalid."""
    points_with_negative = [(1, 2), (3, 4), (5, -6), (7, 8)]
    metadata = CoordinatesMetadata(points_with_negative, PixelSpace)
    is_valid = coord_has_valid_points(metadata)
    assert is_valid is False
|
|
|
|
|
|
def test_coord_weird_values():
    """A point containing a string instead of a number makes the set invalid."""
    points_with_string = [(1, 2), ("3", 4), (5, 6), (7, 8)]
    metadata = CoordinatesMetadata(points_with_string, PixelSpace)
    is_valid = coord_has_valid_points(metadata)
    assert is_valid is False
|
|
|
|
|
|
def test_coord_invalid_point_structure():
    """A point made of three values instead of two makes the set invalid."""
    points_with_triple = [(1, 2), (3, 4, 5), (6, 7), (8, 9)]
    metadata = CoordinatesMetadata(points_with_triple, PixelSpace)
    is_valid = coord_has_valid_points(metadata)
    assert is_valid is False
|
|
|
|
|
|
@pytest.mark.parametrize("sort_mode", ["xy-cut", "basic"])
def test_sort_page_elements_without_coordinates(sort_mode):
    """Elements created without coordinates compare equal to the input after sorting.

    The parametrized ``sort_mode`` is forwarded to ``sort_page_elements`` so that each
    listed mode is actually exercised; without forwarding it, every parametrized case
    would run with the function's default mode only.
    """
    elements = [Element(str(idx)) for idx in range(5)]
    assert sort_page_elements(elements, sort_mode=sort_mode) == elements
|
|
|
|
|
|
def test_sort_xycut_neg_coordinates():
    """Sorting in xy-cut mode hands back a list object distinct from the input list."""
    elements = []
    for idx in range(2):
        first_point = (0, idx)
        item = Text(str(idx))
        item.metadata.coordinates = CoordinatesMetadata(
            [first_point, (3, 4), (6, 7), (8, 9)],
            PixelSpace,
        )
        elements.append(item)

    # NOTE(review): despite the test name, every value built above is non-negative
    # (the first point's y is 0 or 1). The assertion only requires that the returned
    # list is a different object from the input -- confirm the intended input and
    # expectation against sort_page_elements.
    result = sort_page_elements(elements, sort_mode=SORT_MODE_XY_CUT)
    assert result is not elements
|
|
|
|
|
|
def test_sort_xycut_pos_coordinates():
    """With all-positive coordinates, xy-cut sorting returns a new list object."""
    points = [(1, 2), (3, 4), (6, 7), (8, 9)]
    elements = []
    for idx in range(2):
        item = Text(str(idx))
        # Each element gets its own list of the same four points.
        item.metadata.coordinates = CoordinatesMetadata(list(points), PixelSpace)
        elements.append(item)

    # The returned list must not be the very same object that was passed in.
    result = sort_page_elements(elements, sort_mode=SORT_MODE_XY_CUT)
    assert result is not elements
|
|
|
|
|
|
def test_sort_basic_neg_coordinates():
    """Basic sort orders elements whose first point has a zero or negative y value."""
    elements = []
    for idx in range(3):
        first_point = (1, -idx)
        item = Text(str(idx))
        item.metadata.coordinates = CoordinatesMetadata(
            [first_point, (3, 4), (6, 7), (8, 9)],
            PixelSpace,
        )
        elements.append(item)

    result = sort_page_elements(elements, sort_mode=SORT_MODE_BASIC)

    # Texts "0", "1", "2" carry first-point y values 0, -1, -2 respectively; the
    # expected output lists them from the lowest y value to the highest.
    joined_text = " ".join(str(item.text) for item in result)
    assert joined_text == "2 1 0"
|
|
|
|
|
|
def test_sort_basic_pos_coordinates():
    """Basic sort returns a new list ordered by ascending first-point y value."""
    elements = []
    for idx in range(3):
        value = 9 - idx
        item = Text(str(value))
        item.metadata.coordinates = CoordinatesMetadata(
            [(1, value), (3, 4), (6, 7), (8, 9)],
            PixelSpace,
        )
        elements.append(item)

    result = sort_page_elements(elements, sort_mode=SORT_MODE_BASIC)
    # The sorted output must be a different list object from the input.
    assert result is not elements

    # Input texts are "9", "8", "7" with matching first-point y values, so the
    # expected output is the reverse of the input order.
    joined_text = " ".join(str(item.text) for item in result)
    assert joined_text == "7 8 9"
|