mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-03 04:14:15 +00:00
[issue 1237] fix empty coordinates break sorting bug (#1242)
This PR resolves #1237 by checking whether any coordinates are `None`; if so, it does not attempt to sort with the xy-cut method and returns the list as is.
This commit is contained in:
parent
ed7f991ab9
commit
9191be7ae8
@ -44,6 +44,7 @@
|
|||||||
* Edit `add_pytesseract_bbox_to_elements`'s (`ocr_only` strategy) `metadata.coordinates.points` return type to `Tuple` for consistency.
|
* Edit `add_pytesseract_bbox_to_elements`'s (`ocr_only` strategy) `metadata.coordinates.points` return type to `Tuple` for consistency.
|
||||||
* Re-enable test-ingest-confluence-diff for ingest tests
|
* Re-enable test-ingest-confluence-diff for ingest tests
|
||||||
* Fix syntax for ingest test check number of files
|
* Fix syntax for ingest test check number of files
|
||||||
|
* Fix a bug where `xy-cut` sorting attempts to sort elements without valid coordinates; now xy cut sorting only works when **all** elements have valid coordinates
|
||||||
|
|
||||||
## 0.10.8
|
## 0.10.8
|
||||||
|
|
||||||
|
10
test_unstructured/partition/utils/test_sorting.py
Normal file
10
test_unstructured/partition/utils/test_sorting.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Element
|
||||||
|
from unstructured.partition.utils.sorting import sort_page_elements
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("sort_mode", ["xy-cut", "basic"])
|
||||||
|
def test_sort_page_elements_without_coordinates(sort_mode):
|
||||||
|
elements = [Element(str(idx)) for idx in range(5)]
|
||||||
|
assert sort_page_elements(elements) == elements
|
@ -3,6 +3,7 @@ from typing import List
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from unstructured.documents.elements import CoordinatesMetadata, Element
|
from unstructured.documents.elements import CoordinatesMetadata, Element
|
||||||
|
from unstructured.logger import logger
|
||||||
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
|
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
|
||||||
from unstructured.partition.utils.xycut import recursive_xy_cut
|
from unstructured.partition.utils.xycut import recursive_xy_cut
|
||||||
|
|
||||||
@ -27,7 +28,8 @@ def sort_page_elements(
|
|||||||
- sort_mode (str, optional): The mode by which the elements will be sorted. Default is
|
- sort_mode (str, optional): The mode by which the elements will be sorted. Default is
|
||||||
SORT_MODE_XY_CUT.
|
SORT_MODE_XY_CUT.
|
||||||
- SORT_MODE_XY_CUT: Sorts elements based on XY-cut sorting approach. Requires the
|
- SORT_MODE_XY_CUT: Sorts elements based on XY-cut sorting approach. Requires the
|
||||||
recursive_xy_cut function and coordinates_to_bbox function to be defined.
|
recursive_xy_cut function and coordinates_to_bbox function to be defined. And requires all
|
||||||
|
elements to have valid coordinates
|
||||||
- SORT_MODE_BASIC: Sorts elements based on their coordinates. Elements without coordinates
|
- SORT_MODE_BASIC: Sorts elements based on their coordinates. Elements without coordinates
|
||||||
will be pushed to the end.
|
will be pushed to the end.
|
||||||
- If an unrecognized sort_mode is provided, the function returns the elements as-is.
|
- If an unrecognized sort_mode is provided, the function returns the elements as-is.
|
||||||
@ -41,10 +43,17 @@ def sort_page_elements(
|
|||||||
|
|
||||||
if sort_mode == SORT_MODE_XY_CUT:
|
if sort_mode == SORT_MODE_XY_CUT:
|
||||||
coordinates_list = [el.metadata.coordinates for el in page_elements]
|
coordinates_list = [el.metadata.coordinates for el in page_elements]
|
||||||
boxes = [coordinates_to_bbox(coords) for coords in coordinates_list]
|
if any(coords is None for coords in coordinates_list):
|
||||||
res: List[int] = []
|
logger.warning(
|
||||||
recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res)
|
"some or all elements are missing coordinates from this page so we can't sort the "
|
||||||
sorted_page_elements = [page_elements[i] for i in res]
|
"elements",
|
||||||
|
)
|
||||||
|
sorted_page_elements = page_elements
|
||||||
|
else:
|
||||||
|
boxes = [coordinates_to_bbox(coords) for coords in coordinates_list]
|
||||||
|
res: List[int] = []
|
||||||
|
recursive_xy_cut(np.asarray(boxes).astype(int), np.arange(len(boxes)), res)
|
||||||
|
sorted_page_elements = [page_elements[i] for i in res]
|
||||||
elif sort_mode == SORT_MODE_BASIC:
|
elif sort_mode == SORT_MODE_BASIC:
|
||||||
sorted_page_elements = sorted(
|
sorted_page_elements = sorted(
|
||||||
page_elements,
|
page_elements,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user