mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 12:19:36 +00:00
fix: coordinates bug on pdf parsing (#1462)
Addresses: https://github.com/Unstructured-IO/unstructured/issues/1460 We were raising an error on invalid coordinates, which prevented us from returning the element and continuing to parse the PDF. Now, instead of raising the error, we return early. To test: ``` from unstructured.partition.auto import partition elements = partition(url='https://www.apple.com/environment/pdf/Apple_Environmental_Progress_Report_2022.pdf', strategy="fast") ``` --------- Co-authored-by: cragwolfe <crag@unstructured.io>
This commit is contained in:
parent
b54994ae95
commit
e359afafbe
@ -1,5 +1,4 @@
|
||||
## 0.10.16-dev2
|
||||
|
||||
## 0.10.16
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +9,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fixes an issue that caused a partition error for some PDFs.** Fixes GH Issue 1460 by bypassing a coordinate check if an element has invalid coordinates.
|
||||
|
||||
## 0.10.15
|
||||
|
||||
### Enhancements
|
||||
|
BIN
example-docs/negative-coords.pdf
Normal file
BIN
example-docs/negative-coords.pdf
Normal file
Binary file not shown.
@ -226,6 +226,14 @@ def test_partition_pdf_with_fast_strategy(
|
||||
assert element.metadata.filename == "layout-parser-paper-fast.pdf"
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_neg_coordinates():
|
||||
filename = "example-docs/negative-coords.pdf"
|
||||
elements = pdf.partition_pdf(filename=filename, url=None, strategy="fast")
|
||||
assert len(elements) == 5
|
||||
assert elements[0].metadata.coordinates.points[0][0] < 0
|
||||
assert elements[0].metadata.coordinates.points[1][0] < 0
|
||||
|
||||
|
||||
def test_partition_pdf_with_fast_groups_text(
|
||||
filename="example-docs/layout-parser-paper-fast.pdf",
|
||||
):
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.16-dev2" # pragma: no cover
|
||||
__version__ = "0.10.16" # pragma: no cover
|
||||
|
@ -27,7 +27,7 @@ from unstructured.file_utils.filetype import (
|
||||
FileType,
|
||||
add_metadata_with_filetype,
|
||||
)
|
||||
from unstructured.logger import logger
|
||||
from unstructured.logger import logger, trace_logger
|
||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||
from unstructured.partition.common import (
|
||||
convert_to_bytes,
|
||||
@ -761,7 +761,10 @@ def check_coords_within_boundary(
|
||||
a float ranges from [0,1] to scale the horizontal (x-axis) boundary
|
||||
"""
|
||||
if not coord_has_valid_points(coordinates) and not coord_has_valid_points(boundary):
|
||||
raise ValueError("Invalid coordinates.")
|
||||
trace_logger.detail( # type: ignore
|
||||
f"coordinates {coordinates} and boundary {boundary} did not pass validation",
|
||||
)
|
||||
return False
|
||||
|
||||
boundary_x_min = boundary.points[0][0]
|
||||
boundary_x_max = boundary.points[2][0]
|
||||
|
Loading…
x
Reference in New Issue
Block a user