mirror of https://github.com/Unstructured-IO/unstructured.git
fix: fix occasional key error when mapping parent id (#3658)
This PR fixes an occasional `KeyError` when calling `assign_and_map_hash_ids`.

- This happens when the input `elements` contains duplicated element instances or metadata.
- When there are duplicates, the logic that iterates through all elements to map their parent ids raises an error once a parent id that has already been mapped comes up for mapping again.
- The fix adds a check for whether the parent id exists in `old_to_new_mapping`; if it does not, we skip mapping it. A minimal sketch of the failure mode follows below.

## test

This PR adds a unit test for this case; the test fails without the fix.
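To make the failure mode concrete, here is a minimal, self-contained sketch. It is not the library's actual implementation: `Elem` and `toy_assign_and_map` are hypothetical stand-ins for `Element` and `assign_and_map_hash_ids`, and the `hash-{seq}` ids stand in for the real sequence-dependent hashes. It only illustrates why a duplicated instance breaks the assign-then-map two-pass scheme:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Elem:
    """Hypothetical stand-in for the library's Element."""
    id: str
    parent_id: Optional[str] = None


def toy_assign_and_map(elements: list[Elem]) -> list[Elem]:
    # Pass 1: assign each element a fresh, position-dependent "hash" id
    # and record old id -> new id.
    old_to_new_mapping: dict[str, str] = {}
    for seq, e in enumerate(elements):
        new_id = f"hash-{seq}"  # stand-in for the real hash
        old_to_new_mapping[e.id] = new_id
        e.id = new_id
    # Pass 2: rewrite parent ids through the mapping.
    for e in elements:
        parent_id = e.parent_id
        if not parent_id or parent_id not in old_to_new_mapping:  # the fix
            continue
        e.parent_id = old_to_new_mapping[parent_id]
    return elements


parent = Elem(id="p")
child = Elem(id="c", parent_id="p")
toy_assign_and_map([parent, child, child])  # same instance appears twice
# Pass 2 visits `child` twice. The first visit rewrites its parent_id to
# "hash-0"; on the second visit "hash-0" is not a key in the mapping, so
# without the `parent_id not in old_to_new_mapping` guard the lookup
# raises KeyError.
```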
parent 6428d19e5a
commit 903efb0c6d
```diff
@@ -8,6 +8,7 @@
 
 * **Update Python SDK usage in `partition_via_api`.** Make a minor syntax change to ensure forward compatibility with the upcoming 0.26.0 Python SDK.
 * **Remove "unused" `date_from_file_object` parameter.** As part of simplifying partitioning parameter set, remove `date_from_file_object` parameter. A file object does not have a last-modified date attribute so can never give a useful value. When a file-object is used as the document source (such as in Unstructured API) the last-modified date must come from the `metadata_last_modified` argument.
+* **Fix occasional `KeyError` when mapping parent ids to hash ids.** Occasionally the input elements into `assign_and_map_hash_ids` can contain duplicated element instances, which leads to an error when mapping parent ids.
 
 ## 0.15.13
 
```
```diff
@@ -704,6 +704,21 @@ def test_hash_ids_are_unique_for_duplicate_elements():
     ), "Parent ID hasn't changed after recalculation"
 
 
+def test_hash_ids_can_handle_duplicated_element_instances():
+    # GIVEN
+    parent = Text(text="Parent", metadata=ElementMetadata(page_number=1))
+    element = Text(text="Element", metadata=ElementMetadata(page_number=1, parent_id=parent.id))
+    elements = [parent, element, element]
+
+    # WHEN
+    updated_elements = assign_and_map_hash_ids(copy.deepcopy(elements))
+    ids = [element.id for element in updated_elements]
+
+    # THEN
+    assert len(ids) == len(set(ids)) + 1, "One element is duplicated so uniques should be one less."
+    assert elements[1].metadata.parent_id == elements[2].metadata.parent_id
+
+
 def test_hash_ids_are_deterministic():
     parent = Text(text="Parent", metadata=ElementMetadata(page_number=1))
     elements = [
```
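One detail the new test leans on, assuming standard library semantics: `copy.deepcopy` preserves aliasing within the structure it copies, so a list that holds the same element instance twice still holds a single shared instance after copying, which is exactly the duplication that used to trigger the `KeyError`:

```python
import copy

element = {"id": "c"}
elements = [element, element]  # the same instance twice

copied = copy.deepcopy(elements)
assert copied[0] is copied[1]        # aliasing survives the deep copy
assert copied[0] is not elements[0]  # but the copies are new objects
```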
```diff
@@ -562,7 +562,7 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
     # -- map old parent IDs to new ones --
     for e in elements:
         parent_id = e.metadata.parent_id
-        if not parent_id:
+        if not parent_id or parent_id not in old_to_new_mapping:
             continue
         e.metadata.parent_id = old_to_new_mapping[parent_id]
 
```
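For comparison only (the commit keeps the explicit membership check above), the same "leave unmappable parent ids untouched" behavior could be written with `dict.get`. This is a sketch of an equivalent rewrite of the loop in the hunk above, reusing its names, not what the library ships:

```python
for e in elements:
    parent_id = e.metadata.parent_id
    if not parent_id:
        continue
    # Fall back to the existing parent_id when it was never recorded in
    # the mapping (e.g. it was already rewritten on an earlier visit to
    # this same duplicated instance).
    e.metadata.parent_id = old_to_new_mapping.get(parent_id, parent_id)
```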