diff --git a/CHANGELOG.md b/CHANGELOG.md
index b4eea0e57..158f6e24c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 
 * **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK.
 * **Remove "unused" `date_from_file_object` parameter.** As part of simplifying partitioning parameter set, remove `date_from_file_object` parameter. A file object does not have a last-modified date attribute so can never give a useful value. When a file-object is used as the document source (such as in Unstructured API) the last-modified date must come from the `metadata_last_modified` argument.
+* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements passed to `assign_and_map_hash_ids` can contain duplicated element instances, which leads to an error when mapping parent ids.
 
 ## 0.15.13
 
diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py
index 7b3ff7360..298d6bed8 100644
--- a/test_unstructured/documents/test_elements.py
+++ b/test_unstructured/documents/test_elements.py
@@ -704,6 +704,22 @@ def test_hash_ids_are_unique_for_duplicate_elements():
     ), "Parent ID hasn't changed after recalculation"
 
 
+def test_hash_ids_can_handle_duplicated_element_instances():
+    # GIVEN
+    parent = Text(text="Parent", metadata=ElementMetadata(page_number=1))
+    element = Text(text="Element", metadata=ElementMetadata(page_number=1, parent_id=parent.id))
+    elements = [parent, element, element]
+
+    # WHEN (deepcopy's memo keeps both duplicate entries as one shared instance)
+    updated_elements = assign_and_map_hash_ids(copy.deepcopy(elements))
+    ids = [element.id for element in updated_elements]
+
+    # THEN
+    assert len(ids) == len(set(ids)) + 1, "One element is duplicated so uniques should be one less."
+    # -- check the returned elements, not the inputs: the child must reference the parent's new id --
+    assert updated_elements[1].metadata.parent_id == updated_elements[0].id
+
+
 def test_hash_ids_are_deterministic():
     parent = Text(text="Parent", metadata=ElementMetadata(page_number=1))
     elements = [
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index c1b29ee6d..07971b806 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -562,7 +562,7 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
     # -- map old parent IDs to new ones --
     for e in elements:
         parent_id = e.metadata.parent_id
-        if not parent_id:
+        if not parent_id or parent_id not in old_to_new_mapping:
             continue
         e.metadata.parent_id = old_to_new_mapping[parent_id]
 