mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-09 18:15:55 +00:00
fix: type error string indices bug (#1940)
Fix `TypeError: string indices must be integers`. The `annotation_dict` variable is now set to `None` if the resolved annotation is not a dict, and the annotation is skipped when the value is `None`.
This commit is contained in:
parent
c3e42e9ffc
commit
a11d4634f1
@ -23,6 +23,7 @@
|
|||||||
* **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements
|
* **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements
|
||||||
* **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated, producing an over-sized chunk that required mid-text splitting.
|
* **Fix unnecessary mid-text chunk-splitting.** The "pre-chunker" did not consider separator blank-line ("\n\n") length when grouping elements for a single chunk. As a result, sections were frequently over-populated, producing an over-sized chunk that required mid-text splitting.
|
||||||
* **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only.
|
* **Fix frequent dissociation of title from chunk.** The sectioning algorithm included the title of the next section with the prior section whenever it would fit, frequently producing association of a section title with the prior section and dissociating it from its actual section. Fix this by performing combination of whole sections only.
|
||||||
|
* **Fix PDF attempt to get dict value from string.** Fixes a rare edge case that prevented some PDFs from being partitioned. The `get_uris_from_annots` function tried to access the dictionary value of a string instance variable. Assign `None` to the annotation variable if the instance type is not a dictionary to avoid the erroneous attempt.
|
||||||
|
|
||||||
## 0.10.27
|
## 0.10.27
|
||||||
|
|
||||||
|
@ -19,6 +19,7 @@ from unstructured.documents.elements import (
|
|||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.partition import ocr, pdf, strategies
|
from unstructured.partition import ocr, pdf, strategies
|
||||||
|
from unstructured.partition.pdf import get_uris_from_annots
|
||||||
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
||||||
|
|
||||||
|
|
||||||
@ -981,6 +982,51 @@ def test_ocr_language_passes_through(strategy, ocr_func):
|
|||||||
assert kwargs["lang"] == "kor"
|
assert kwargs["lang"] == "kor"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
("annots", "height", "coordinate_system", "page_number", "expected"),
|
||||||
|
[
|
||||||
|
(["BS", "BE"], 300, PixelSpace(300, 300), 1, 0),
|
||||||
|
(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"Type": "/'Annot'",
|
||||||
|
"Subtype": "/'Link'",
|
||||||
|
"A": {
|
||||||
|
"Type": "/'Action'",
|
||||||
|
"S": "/'URI'",
|
||||||
|
"URI": "b'https://layout-parser.github.io'",
|
||||||
|
},
|
||||||
|
"BS": {"S": "/'S'", "W": 1},
|
||||||
|
"Border": [0, 0, 1],
|
||||||
|
"C": [0, 1, 1],
|
||||||
|
"H": "/'I'",
|
||||||
|
"Rect": [304.055, 224.156, 452.472, 234.368],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Type": "/'Annot'",
|
||||||
|
"Subtype": "/'Link'",
|
||||||
|
"A": {"S": "/'GoTo'", "D": "b'cite.harley2015evaluation'"},
|
||||||
|
"BS": {"S": "/'S'", "W": 1},
|
||||||
|
"Border": [0, 0, 1],
|
||||||
|
"C": [0, 1, 0],
|
||||||
|
"H": "/'I'",
|
||||||
|
"Rect": [468.305, 128.081, 480.26, 136.494],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
792,
|
||||||
|
PixelSpace(612, 792),
|
||||||
|
1,
|
||||||
|
2,
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_get_uris_from_annots_string_annotation(
|
||||||
|
annots, height, coordinate_system, page_number, expected
|
||||||
|
):
|
||||||
|
annotation_list = get_uris_from_annots(annots, height, coordinate_system, page_number)
|
||||||
|
assert len(annotation_list) == expected
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
|
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
("filename", "is_image"),
|
("filename", "is_image"),
|
||||||
|
@ -921,8 +921,14 @@ def get_uris_from_annots(
|
|||||||
"""
|
"""
|
||||||
annotation_list = []
|
annotation_list = []
|
||||||
for annotation in annots:
|
for annotation in annots:
|
||||||
annotation_dict = try_resolve(annotation)
|
annotation_dict = (
|
||||||
if str(annotation_dict["Subtype"]) != "/'Link'" or "A" not in annotation_dict:
|
try_resolve(annotation) if isinstance(try_resolve(annotation), dict) else None
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
not annotation_dict
|
||||||
|
or str(annotation_dict["Subtype"]) != "/'Link'"
|
||||||
|
or "A" not in annotation_dict
|
||||||
|
):
|
||||||
continue
|
continue
|
||||||
x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height)
|
x1, y1, x2, y2 = rect_to_bbox(annotation_dict["Rect"], height)
|
||||||
uri_dict = try_resolve(annotation_dict["A"])
|
uri_dict = try_resolve(annotation_dict["A"])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user