From 9191be7ae89ea8234712bcdfafdba9df4ce92f70 Mon Sep 17 00:00:00 2001 From: Yao You Date: Thu, 31 Aug 2023 22:15:10 -0500 Subject: [PATCH] [issue 1237] fix empty coordinates break sorting bug (#1242) This PR resolves #1237 by checking if any coordinates are `None`; if yes do not attempt to sort with xy cut method and return the list as is. --- CHANGELOG.md | 1 + .../partition/utils/test_sorting.py | 10 ++++++++++ unstructured/partition/utils/sorting.py | 19 ++++++++++++++----- 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 test_unstructured/partition/utils/test_sorting.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7594afc96..744245a6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ * Edit `add_pytesseract_bbox_to_elements`'s (`ocr_only` strategy) `metadata.coordinates.points` return type to `Tuple` for consistency. * Re-enable test-ingest-confluence-diff for ingest tests * Fix syntax for ingest test check number of files +* Fix a bug where `xy-cut` sorting attempts to sort elements without valid coordinates; now xy cut sorting only works when **all** elements have valid coordinates ## 0.10.8 diff --git a/test_unstructured/partition/utils/test_sorting.py b/test_unstructured/partition/utils/test_sorting.py new file mode 100644 index 000000000..d372da268 --- /dev/null +++ b/test_unstructured/partition/utils/test_sorting.py @@ -0,0 +1,10 @@ +import pytest + +from unstructured.documents.elements import Element +from unstructured.partition.utils.sorting import sort_page_elements + + +@pytest.mark.parametrize("sort_mode", ["xy-cut", "basic"]) +def test_sort_page_elements_without_coordinates(sort_mode): + elements = [Element(str(idx)) for idx in range(5)] + assert sort_page_elements(elements) == elements diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py index 79293fe5f..897a343f9 100644 --- a/unstructured/partition/utils/sorting.py +++ b/unstructured/partition/utils/sorting.py @@ -3,6 
+3,7 @@ from typing import List import numpy as np from unstructured.documents.elements import CoordinatesMetadata, Element +from unstructured.logger import logger from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT from unstructured.partition.utils.xycut import recursive_xy_cut @@ -27,7 +28,8 @@ def sort_page_elements( - sort_mode (str, optional): The mode by which the elements will be sorted. Default is SORT_MODE_XY_CUT. - SORT_MODE_XY_CUT: Sorts elements based on XY-cut sorting approach. Requires the - recursive_xy_cut function and coordinates_to_bbox function to be defined. + recursive_xy_cut function and coordinates_to_bbox function to be defined. And requires all + elements to have valid coordinates - SORT_MODE_BASIC: Sorts elements based on their coordinates. Elements without coordinates will be pushed to the end. - If an unrecognized sort_mode is provided, the function returns the elements as-is. @@ -41,10 +43,17 @@ def sort_page_elements( if sort_mode == SORT_MODE_XY_CUT: coordinates_list = [el.metadata.coordinates for el in page_elements] - boxes = [coordinates_to_bbox(coords) for coords in coordinates_list] - res: List[int] = [] - recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res) - sorted_page_elements = [page_elements[i] for i in res] + if any(coords is None for coords in coordinates_list): + logger.warning( + "some or all elements are missing coordinates from this page so we can't sort the " + "elements", + ) + sorted_page_elements = page_elements + else: + boxes = [coordinates_to_bbox(coords) for coords in coordinates_list] + res: List[int] = [] + recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res) + sorted_page_elements = [page_elements[i] for i in res] elif sort_mode == SORT_MODE_BASIC: sorted_page_elements = sorted( page_elements,