fix: type error string indices bug (#1940)

Fix `TypeError: string indices must be integers`. The `annotation_dict`
variable is now set to `None` when the resolved annotation is not a dict,
and the loop skips any annotation whose value is `None`.
This commit is contained in:
Klaijan 2023-10-30 20:38:57 -04:00 committed by GitHub
parent c3e42e9ffc
commit a11d4634f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 2 deletions

View File

@ -23,6 +23,7 @@
* **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements * **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements
* **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated producing an over-sized chunk that required mid-text splitting. * **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated producing an over-sized chunk that required mid-text splitting.
* **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only. * **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only.
* **Fix PDF attempt to get dict value from string.** Fixes a rare edge case that prevented some PDFs from being partitioned. The `get_uris_from_annots` function tried to look up a dictionary key on an annotation that was a string. The annotation variable is now assigned `None` when it is not a dictionary, and such annotations are skipped.
## 0.10.27 ## 0.10.27

View File

@ -19,6 +19,7 @@ from unstructured.documents.elements import (
Title, Title,
) )
from unstructured.partition import ocr, pdf, strategies from unstructured.partition import ocr, pdf, strategies
from unstructured.partition.pdf import get_uris_from_annots
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
@ -981,6 +982,51 @@ def test_ocr_language_passes_through(strategy, ocr_func):
assert kwargs["lang"] == "kor" assert kwargs["lang"] == "kor"
@pytest.mark.parametrize(
    ("annots", "height", "coordinate_system", "page_number", "expected"),
    [
        # Annotations given as plain strings instead of dicts: zero entries expected.
        (["BS", "BE"], 300, PixelSpace(300, 300), 1, 0),
        # Two dict-shaped link annotations: two entries expected.
        (
            [
                {
                    "Type": "/'Annot'",
                    "Subtype": "/'Link'",
                    "A": {
                        "Type": "/'Action'",
                        "S": "/'URI'",
                        "URI": "b'https://layout-parser.github.io'",
                    },
                    "BS": {"S": "/'S'", "W": 1},
                    "Border": [0, 0, 1],
                    "C": [0, 1, 1],
                    "H": "/'I'",
                    "Rect": [304.055, 224.156, 452.472, 234.368],
                },
                {
                    "Type": "/'Annot'",
                    "Subtype": "/'Link'",
                    "A": {"S": "/'GoTo'", "D": "b'cite.harley2015evaluation'"},
                    "BS": {"S": "/'S'", "W": 1},
                    "Border": [0, 0, 1],
                    "C": [0, 1, 0],
                    "H": "/'I'",
                    "Rect": [468.305, 128.081, 480.26, 136.494],
                },
            ],
            792,
            PixelSpace(612, 792),
            1,
            2,
        ),
    ],
)
def test_get_uris_from_annots_string_annotation(
    annots, height, coordinate_system, page_number, expected
):
    """Check the number of entries `get_uris_from_annots` returns for each annotation list."""
    extracted = get_uris_from_annots(annots, height, coordinate_system, page_number)
    assert len(extracted) == expected
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"]) @pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
("filename", "is_image"), ("filename", "is_image"),

View File

@ -921,8 +921,14 @@ def get_uris_from_annots(
""" """
annotation_list = [] annotation_list = []
for annotation in annots: for annotation in annots:
annotation_dict = try_resolve(annotation) annotation_dict = (
if str(annotation_dict["Subtype"]) != "/'Link'" or "A" not in annotation_dict: try_resolve(annotation) if isinstance(try_resolve(annotation), dict) else None
)
if (
not annotation_dict
or str(annotation_dict["Subtype"]) != "/'Link'"
or "A" not in annotation_dict
):
continue continue
x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height) x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height)
uri_dict = try_resolve(annotation_dict["A"]) uri_dict = try_resolve(annotation_dict["A"])