fix: type error string indices bug (#1940)

Fix `TypeError: string indices must be integers`. The `annotation_dict` variable is now set to `None` when the resolved annotation is not a dict, and the loop skips the annotation when the value is `None`.
2025-12-06 20:13:04 +00:00 · 2023-10-30 20:38:57 -04:00 · 2023-10-30 20:38:57 -04:00 · a11d4634f1
commit a11d4634f1
parent c3e42e9ffc
3 changed files with 55 additions and 2 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -23,6 +23,7 @@
 * **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements
 * **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated, producing an over-sized chunk that required mid-text splitting.
 * **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only.
 * **Fix PDF attempt to get dict value from string.** Fixes a rare edge case that prevented some PDFs from being partitioned. The `get_uris_from_annots` function tried to access a dictionary value on a string instance. Assign `None` to the annotation variable if the instance is not a dictionary to avoid the erroneous attempt.
 ## 0.10.27
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -19,6 +19,7 @@ from unstructured.documents.elements import (
    Title,
 )
 from unstructured.partition import ocr, pdf, strategies
 from unstructured.partition.pdf import get_uris_from_annots
 from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
@ -981,6 +982,51 @@ def test_ocr_language_passes_through(strategy, ocr_func):
    assert kwargs["lang"] == "kor"
@pytest.mark.parametrize(
    ("annots", "height", "coordinate_system", "page_number", "expected"),
    [
        # Annotations that are plain strings rather than dicts: expect no results.
        (["BS", "BE"], 300, PixelSpace(300, 300), 1, 0),
        # Two dict annotations with Subtype "/'Link'" and an "A" entry: expect one
        # result per annotation.
        (
            [
                {
                    "Type": "/'Annot'",
                    "Subtype": "/'Link'",
                    "A": {
                        "Type": "/'Action'",
                        "S": "/'URI'",
                        "URI": "b'https://layout-parser.github.io'",
                    },
                    "BS": {"S": "/'S'", "W": 1},
                    "Border": [0, 0, 1],
                    "C": [0, 1, 1],
                    "H": "/'I'",
                    "Rect": [304.055, 224.156, 452.472, 234.368],
                },
                {
                    "Type": "/'Annot'",
                    "Subtype": "/'Link'",
                    "A": {"S": "/'GoTo'", "D": "b'cite.harley2015evaluation'"},
                    "BS": {"S": "/'S'", "W": 1},
                    "Border": [0, 0, 1],
                    "C": [0, 1, 0],
                    "H": "/'I'",
                    "Rect": [468.305, 128.081, 480.26, 136.494],
                },
            ],
            792,
            PixelSpace(612, 792),
            1,
            2,
        ),
    ],
 )
 def test_get_uris_from_annots_string_annotation(
    annots, height, coordinate_system, page_number, expected
 ):
    """Check how many entries `get_uris_from_annots` returns for each input.

    The first parametrized case passes string annotations and expects an empty
    result; the second passes two link annotation dicts and expects two entries.
    """
    annotation_list = get_uris_from_annots(annots, height, coordinate_system, page_number)
    assert len(annotation_list) == expected
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    ("filename", "is_image"),
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -921,8 +921,14 @@ def get_uris_from_annots(
    """
    annotation_list = []
    for annotation in annots:
-        annotation_dict = try_resolve(annotation)
+        annotation_dict = (
-        if str(annotation_dict["Subtype"]) != "/'Link'" or "A" not in annotation_dict:
+            try_resolve(annotation) if isinstance(try_resolve(annotation), dict) else None
        )
        if (
            not annotation_dict
            or str(annotation_dict["Subtype"]) != "/'Link'"
            or "A" not in annotation_dict
        ):
            continue
        x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height)
        uri_dict = try_resolve(annotation_dict["A"])