diff --git a/CHANGELOG.md b/CHANGELOG.md index b230b7958..b72ed86c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,4 @@ -## 0.10.16-dev2 - +## 0.10.16 ### Enhancements @@ -10,6 +9,8 @@ ### Fixes +* **Fixes an issue that caused a partition error for some PDFs.** Fixes GH Issue 1460 by bypassing a coordinate check if an element has invalid coordinates. + ## 0.10.15 ### Enhancements diff --git a/example-docs/negative-coords.pdf b/example-docs/negative-coords.pdf new file mode 100644 index 000000000..617bd8a47 Binary files /dev/null and b/example-docs/negative-coords.pdf differ diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index a3209a9cf..717824823 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -226,6 +226,14 @@ def test_partition_pdf_with_fast_strategy( assert element.metadata.filename == "layout-parser-paper-fast.pdf" +def test_partition_pdf_with_fast_neg_coordinates(): + filename = "example-docs/negative-coords.pdf" + elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast") + assert len(elements) == 5 + assert elements[0].metadata.coordinates.points[0][0] < 0 + assert elements[0].metadata.coordinates.points[1][0] < 0 + + def test_partition_pdf_with_fast_groups_text( filename="example-docs/layout-parser-paper-fast.pdf", ): diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b6ecb5cd2..eb57d870d 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.16-dev2" # pragma: no cover +__version__ = "0.10.16" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 1cfd26425..1c451ca42 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -27,7 +27,7 @@ from unstructured.file_utils.filetype import ( FileType, add_metadata_with_filetype, ) -from 
unstructured.logger import logger +from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition.common import ( convert_to_bytes, @@ -761,7 +761,10 @@ def check_coords_within_boundary( a float ranges from [0,1] to scale the horizontal (x-axis) boundary """ if not coord_has_valid_points(coordinates) and not coord_has_valid_points(boundary): - raise ValueError("Invalid coordinates.") + trace_logger.detail( # type: ignore + f"coordinates {coordinates} and boundary {boundary} did not pass validation", + ) + return False boundary_x_min = boundary.points[0][0] boundary_x_max = boundary.points[2][0]