diff --git a/CHANGELOG.md b/CHANGELOG.md index dff1f4688..5cc69a44c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements * **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated producing a over-sized chunk that required mid-text splitting. * **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only. +* **Fix PDF attempt to get dict value from string.** Fixes a rare edge case that prevented some PDFs from being partitioned. The `get_uris_from_annots` function tried to access a dictionary value on an annotation that was a string instance. Assign `None` to the annotation variable if the resolved annotation is not a dictionary to avoid the erroneous attempt.
## 0.10.27 diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 34d56b8a3..fe386546c 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -19,6 +19,7 @@ from unstructured.documents.elements import ( Title, ) from unstructured.partition import ocr, pdf, strategies +from unstructured.partition.pdf import get_uris_from_annots from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA @@ -981,6 +982,51 @@ def test_ocr_language_passes_through(strategy, ocr_func): assert kwargs["lang"] == "kor" +@pytest.mark.parametrize( + ("annots", "height", "coordinate_system", "page_number", "expected"), + [ + (["BS", "BE"], 300, PixelSpace(300, 300), 1, 0), + ( + [ + { + "Type": "/'Annot'", + "Subtype": "/'Link'", + "A": { + "Type": "/'Action'", + "S": "/'URI'", + "URI": "b'https://layout-parser.github.io'", + }, + "BS": {"S": "/'S'", "W": 1}, + "Border": [0, 0, 1], + "C": [0, 1, 1], + "H": "/'I'", + "Rect": [304.055, 224.156, 452.472, 234.368], + }, + { + "Type": "/'Annot'", + "Subtype": "/'Link'", + "A": {"S": "/'GoTo'", "D": "b'cite.harley2015evaluation'"}, + "BS": {"S": "/'S'", "W": 1}, + "Border": [0, 0, 1], + "C": [0, 1, 0], + "H": "/'I'", + "Rect": [468.305, 128.081, 480.26, 136.494], + }, + ], + 792, + PixelSpace(612, 792), + 1, + 2, + ), + ], +) +def test_get_uris_from_annots_string_annotation( + annots, height, coordinate_system, page_number, expected +): + annotation_list = get_uris_from_annots(annots, height, coordinate_system, page_number) + assert len(annotation_list) == expected + + @pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"]) @pytest.mark.parametrize( ("filename", "is_image"), diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index eb42d4303..be10422fb 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -921,8 +921,14 @@ def 
get_uris_from_annots( """ annotation_list = [] for annotation in annots: - annotation_dict = try_resolve(annotation) - if str(annotation_dict["Subtype"]) != "/'Link'" or "A" not in annotation_dict: + annotation_dict = ( + try_resolve(annotation) if isinstance(try_resolve(annotation), dict) else None + ) + if ( + not annotation_dict + or str(annotation_dict["Subtype"]) != "/'Link'" + or "A" not in annotation_dict + ): continue x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height) uri_dict = try_resolve(annotation_dict["A"])