mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-09 10:05:56 +00:00
fix: type error string indices bug (#1940)
Fix `TypeError: string indices must be integers`. The `annotation_dict` variable is now set to `None` when the resolved annotation is not a dict, and the annotation is skipped when the value is `None`.
This commit is contained in:
parent
c3e42e9ffc
commit
a11d4634f1
@ -23,6 +23,7 @@
|
||||
* **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements
|
||||
* **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated, producing an over-sized chunk that required mid-text splitting.
|
||||
* **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only.
|
||||
* **Fix PDF attempt to get dict value from string.** Fixes a rare edge case that prevented some PDFs from being partitioned. The `get_uris_from_annots` function tried to look up a dictionary key on an annotation that was a string. The annotation variable is now assigned `None` when it is not a dictionary, which avoids the erroneous lookup.
|
||||
|
||||
## 0.10.27
|
||||
|
||||
|
@ -19,6 +19,7 @@ from unstructured.documents.elements import (
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition import ocr, pdf, strategies
|
||||
from unstructured.partition.pdf import get_uris_from_annots
|
||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||
|
||||
|
||||
@ -981,6 +982,51 @@ def test_ocr_language_passes_through(strategy, ocr_func):
|
||||
assert kwargs["lang"] == "kor"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("annots", "height", "coordinate_system", "page_number", "expected"),
    [
        # Annotations that are bare strings rather than dicts: no entries expected.
        (["BS", "BE"], 300, PixelSpace(300, 300), 1, 0),
        # Two dict-shaped link annotations (one URI action, one GoTo action):
        # two entries expected.
        (
            [
                {
                    "Type": "/'Annot'",
                    "Subtype": "/'Link'",
                    "A": {
                        "Type": "/'Action'",
                        "S": "/'URI'",
                        "URI": "b'https://layout-parser.github.io'",
                    },
                    "BS": {"S": "/'S'", "W": 1},
                    "Border": [0, 0, 1],
                    "C": [0, 1, 1],
                    "H": "/'I'",
                    "Rect": [304.055, 224.156, 452.472, 234.368],
                },
                {
                    "Type": "/'Annot'",
                    "Subtype": "/'Link'",
                    "A": {"S": "/'GoTo'", "D": "b'cite.harley2015evaluation'"},
                    "BS": {"S": "/'S'", "W": 1},
                    "Border": [0, 0, 1],
                    "C": [0, 1, 0],
                    "H": "/'I'",
                    "Rect": [468.305, 128.081, 480.26, 136.494],
                },
            ],
            792,
            PixelSpace(612, 792),
            1,
            2,
        ),
    ],
)
def test_get_uris_from_annots_string_annotation(
    annots, height, coordinate_system, page_number, expected
):
    """Check how many entries `get_uris_from_annots` returns for each annotation set.

    Covers annotations given as plain strings (which must not raise and must
    yield nothing) alongside ordinary dict-shaped link annotations.
    """
    extracted = get_uris_from_annots(annots, height, coordinate_system, page_number)
    assert len(extracted) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
|
||||
@pytest.mark.parametrize(
|
||||
("filename", "is_image"),
|
||||
|
@ -921,8 +921,14 @@ def get_uris_from_annots(
|
||||
"""
|
||||
annotation_list = []
|
||||
for annotation in annots:
|
||||
annotation_dict = try_resolve(annotation)
|
||||
if str(annotation_dict["Subtype"]) != "/'Link'" or "A" not in annotation_dict:
|
||||
annotation_dict = (
|
||||
try_resolve(annotation) if isinstance(try_resolve(annotation), dict) else None
|
||||
)
|
||||
if (
|
||||
not annotation_dict
|
||||
or str(annotation_dict["Subtype"]) != "/'Link'"
|
||||
or "A" not in annotation_dict
|
||||
):
|
||||
continue
|
||||
x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height)
|
||||
uri_dict = try_resolve(annotation_dict["A"])
|
||||
|
Loading…
x
Reference in New Issue
Block a user