fix: coordinates bug on pdf parsing (#1462)

Addresses: https://github.com/Unstructured-IO/unstructured/issues/1460

We were raising an error when an element had invalid coordinates, which
prevented us from returning the element and continuing to parse the PDF.
Now, instead of raising the error, we return early.

to test:
```
from unstructured.partition.auto import partition

elements = partition(url='https://www.apple.com/environment/pdf/Apple_Environmental_Progress_Report_2022.pdf', strategy="fast")
```

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
This commit is contained in:
Amanda Cameron 2023-09-19 19:25:31 -07:00 committed by GitHub
parent b54994ae95
commit e359afafbe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 17 additions and 5 deletions

View File

@ -1,5 +1,4 @@
## 0.10.16-dev2
## 0.10.16
### Enhancements
@ -10,6 +9,8 @@
### Fixes
* **Fixes an issue that caused a partition error for some PDFs.** Fixes GH Issue 1460 by bypassing a coordinate check if an element has invalid coordinates.
## 0.10.15
### Enhancements

Binary file not shown.

View File

@ -226,6 +226,14 @@ def test_partition_pdf_with_fast_strategy(
assert element.metadata.filename == "layout-parser-paper-fast.pdf"
def test_partition_pdf_with_fast_neg_coordinates():
    """Partition a PDF with negative coordinates using the "fast" strategy.

    Checks that partitioning returns five elements and that the x value of
    the first two coordinate points of the first element is negative.
    """
    elements = pdf.partition_pdf(
        filename="example-docs/negative-coords.pdf",
        url=None,
        strategy="fast",
    )
    assert len(elements) == 5

    # Only the first element's leading two points are inspected.
    first_points = elements[0].metadata.coordinates.points
    assert first_points[0][0] < 0
    assert first_points[1][0] < 0
def test_partition_pdf_with_fast_groups_text(
filename="example-docs/layout-parser-paper-fast.pdf",
):

View File

@ -1 +1 @@
__version__ = "0.10.16-dev2" # pragma: no cover
__version__ = "0.10.16" # pragma: no cover

View File

@ -27,7 +27,7 @@ from unstructured.file_utils.filetype import (
FileType,
add_metadata_with_filetype,
)
from unstructured.logger import logger
from unstructured.logger import logger, trace_logger
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
from unstructured.partition.common import (
convert_to_bytes,
@ -761,7 +761,10 @@ def check_coords_within_boundary(
a float ranges from [0,1] to scale the horizontal (x-axis) boundary
"""
if not coord_has_valid_points(coordinates) and not coord_has_valid_points(boundary):
raise ValueError("Invalid coordinates.")
trace_logger.detail( # type: ignore
f"coordinates {coordinates} and boundary {boundary} did not pass validation",
)
return False
boundary_x_min = boundary.points[0][0]
boundary_x_max = boundary.points[2][0]