[issue 1237] fix empty coordinates break sorting bug (#1242)

This PR resolves #1237 by checking whether any coordinates are `None`; if so,
it does not attempt to sort with the xy-cut method and returns the list as is.
This commit is contained in:
Yao You 2023-08-31 22:15:10 -05:00 committed by GitHub
parent ed7f991ab9
commit 9191be7ae8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 25 additions and 5 deletions

View File

@ -44,6 +44,7 @@
* Edit `add_pytesseract_bbox_to_elements`'s (`ocr_only` strategy) `metadata.coordinates.points` return type to `Tuple` for consistency.
* Re-enable test-ingest-confluence-diff for ingest tests
* Fix syntax for ingest test check number of files
* Fix a bug where `xy-cut` sorting attempts to sort elements without valid coordinates; now xy cut sorting only works when **all** elements have valid coordinates
## 0.10.8

View File

@ -0,0 +1,10 @@
import pytest
from unstructured.documents.elements import Element
from unstructured.partition.utils.sorting import sort_page_elements
@pytest.mark.parametrize("sort_mode", ["xy-cut", "basic"])
def test_sort_page_elements_without_coordinates(sort_mode):
    """Elements lacking coordinates must come back in their original order.

    Runs once per sort mode. The parametrized ``sort_mode`` is forwarded to
    ``sort_page_elements`` so that each mode is actually exercised, rather
    than only the function's default mode.
    """
    # No coordinates are supplied, so none of these elements can be placed
    # spatially on the page.
    elements = [Element(str(idx)) for idx in range(5)]
    assert sort_page_elements(elements, sort_mode=sort_mode) == elements

View File

@ -3,6 +3,7 @@ from typing import List
import numpy as np
from unstructured.documents.elements import CoordinatesMetadata, Element
from unstructured.logger import logger
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import recursive_xy_cut
@ -27,7 +28,8 @@ def sort_page_elements(
- sort_mode (str, optional): The mode by which the elements will be sorted. Default is
SORT_MODE_XY_CUT.
- SORT_MODE_XY_CUT: Sorts elements based on XY-cut sorting approach. Requires the
recursive_xy_cut function and coordinates_to_bbox function to be defined.
recursive_xy_cut function and coordinates_to_bbox function to be defined. And requires all
elements to have valid coordinates
- SORT_MODE_BASIC: Sorts elements based on their coordinates. Elements without coordinates
will be pushed to the end.
- If an unrecognized sort_mode is provided, the function returns the elements as-is.
@ -41,10 +43,17 @@ def sort_page_elements(
if sort_mode == SORT_MODE_XY_CUT:
coordinates_list = [el.metadata.coordinates for el in page_elements]
boxes = [coordinates_to_bbox(coords) for coords in coordinates_list]
res: List[int] = []
recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res)
sorted_page_elements = [page_elements[i] for i in res]
if any(coords is None for coords in coordinates_list):
logger.warning(
"some or all elements are missing coordinates from this page so we can't sort the "
"elements",
)
sorted_page_elements = page_elements
else:
boxes = [coordinates_to_bbox(coords) for coords in coordinates_list]
res: List[int] = []
recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res)
sorted_page_elements = [page_elements[i] for i in res]
elif sort_mode == SORT_MODE_BASIC:
sorted_page_elements = sorted(
page_elements,