rfctr(meta): refine @apply_metadata() decorator (#3667)

**Summary**
Refine `@apply_metadata()` replacement decorator. Note it has not been
installed yet.

- Apply `metadata_last_modified` arg with the `@apply_metadata()`
decorator. No need for redundant processing in each partitioner.
- Add "unique-ify" step to fix any cases where the same `Element` or
`ElementMetadata` instance was used more than once in the element
stream. This prevents unexpected "multi-mutation" in downstream
processes.
- Apply "global" metadata items before computing hash-ids. In
particular, `.metadata.filename` is used in the hash computation and
will produce different results if that's not already settled.
- Compute hash-ids _before_ computing `.metadata.parent_id`. This
removes the need for mapping UUID element-ids to their hash counterpart
and doing a fixup of `.parent_id` after applying hash-ids to elements.

**Additional Context**
- The `@apply_metadata()` decorator replaces the four metadata-related
decorators: `@process_metadata()`, `@add_metadata_with_filetype()`,
`@add_metadata()`, and `@add_filetype()`.
- It will be installed on each partitioner in a series of following PRs.
This commit is contained in:
Steve Canny 2024-10-01 14:28:32 -07:00 committed by GitHub
parent 75c4998bc7
commit c05e1babf1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 186 additions and 12 deletions

View File

@ -1,4 +1,4 @@
## 0.15.14-dev5 ## 0.15.14-dev6
### Enhancements ### Enhancements

View File

@ -1,7 +1,10 @@
"""Test-suite for `unstructured.partition.common.metadata` module.""" """Test-suite for `unstructured.partition.common.metadata` module."""
# pyright: reportPrivateUsage=false
from __future__ import annotations from __future__ import annotations
import copy
import datetime as dt import datetime as dt
import os import os
import pathlib import pathlib
@ -22,6 +25,7 @@ from unstructured.documents.elements import (
) )
from unstructured.file_utils.model import FileType from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import ( from unstructured.partition.common.metadata import (
_assign_hash_ids,
apply_metadata, apply_metadata,
get_last_modified_date, get_last_modified_date,
set_element_hierarchy, set_element_hierarchy,
@ -127,9 +131,50 @@ def test_set_element_hierarchy_custom_rule_set():
), "FigureCaption should be child of Title 2" ), "FigureCaption should be child of Title 2"
# ================================================================================================
# APPLY METADATA DECORATOR
# ================================================================================================
class Describe_apply_metadata: class Describe_apply_metadata:
"""Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator.""" """Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator."""
# -- unique-ify elements and metadata ---------------------------------
def it_produces_unique_elements_and_metadata_when_input_reuses_element_instances(self):
element = Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1))
def fake_partitioner(**kwargs: Any) -> list[Element]:
return [element, element, element]
partition = apply_metadata()(fake_partitioner)
elements = partition()
# -- all elements are unique instances --
assert len({id(e) for e in elements}) == len(elements)
# -- all metadatas are unique instances --
assert len({id(e.metadata) for e in elements}) == len(elements)
def and_it_produces_unique_elements_and_metadata_when_input_reuses_metadata_instances(self):
metadata = ElementMetadata(filename="foo.bar", page_number=1)
def fake_partitioner(**kwargs: Any) -> list[Element]:
return [
Text(text="foo", metadata=metadata),
Text(text="bar", metadata=metadata),
Text(text="baz", metadata=metadata),
]
partition = apply_metadata()(fake_partitioner)
elements = partition()
# -- all elements are unique instances --
assert len({id(e) for e in elements}) == len(elements)
# -- all metadatas are unique instances --
assert len({id(e.metadata) for e in elements}) == len(elements)
# -- unique-ids ------------------------------------------------------- # -- unique-ids -------------------------------------------------------
def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified( def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified(
@ -274,6 +319,29 @@ class Describe_apply_metadata:
assert all(e.metadata.filename == "image.jpeg" for e in elements) assert all(e.metadata.filename == "image.jpeg" for e in elements)
assert all(e.metadata.file_directory == "x/y/images" for e in elements) assert all(e.metadata.file_directory == "x/y/images" for e in elements)
# -- last_modified ----------------------------------------------------
def it_uses_metadata_last_modified_arg_value_when_present(
self, fake_partitioner: Callable[..., list[Element]]
):
"""A `metadata_last_modified` arg overrides all other sources."""
partition = apply_metadata()(fake_partitioner)
metadata_last_modified = "2024-09-26T15:17:53"
elements = partition(metadata_last_modified=metadata_last_modified)
assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
@pytest.mark.parametrize("kwargs", [{}, {"metadata_last_modified": None}])
def but_it_does_not_update_last_modified_when_metadata_last_modified_arg_absent_or_None(
self, kwargs: dict[str, Any], fake_partitioner: Callable[..., list[Element]]
):
partition = apply_metadata()(fake_partitioner)
elements = partition(**kwargs)
assert all(e.metadata.last_modified == "2020-01-06T05:07:03" for e in elements)
# -- url -------------------------------------------------------------- # -- url --------------------------------------------------------------
def it_assigns_url_metadata_field_when_url_arg_is_present( def it_assigns_url_metadata_field_when_url_arg_is_present(
@ -304,14 +372,42 @@ class Describe_apply_metadata:
title.metadata.file_directory = "x/y/images" title.metadata.file_directory = "x/y/images"
title.metadata.filename = "image.jpeg" title.metadata.filename = "image.jpeg"
title.metadata.filetype = "image/jpeg" title.metadata.filetype = "image/jpeg"
title.metadata.last_modified = "2020-01-06T05:07:03"
title.metadata.url = "http://images.com" title.metadata.url = "http://images.com"
narr_text = NarrativeText("To understand bar you must first understand foo.") narr_text = NarrativeText("To understand bar you must first understand foo.")
narr_text.metadata.file_directory = "x/y/images" narr_text.metadata.file_directory = "x/y/images"
narr_text.metadata.filename = "image.jpeg" narr_text.metadata.filename = "image.jpeg"
narr_text.metadata.filetype = "image/jpeg" narr_text.metadata.filetype = "image/jpeg"
narr_text.metadata.last_modified = "2020-01-06T05:07:03"
narr_text.metadata.url = "http://images.com" narr_text.metadata.url = "http://images.com"
return [title, narr_text] return [title, narr_text]
return fake_partitioner return fake_partitioner
# ================================================================================================
# HASH IDS
# ================================================================================================
def test_assign_hash_ids_produces_unique_and_deterministic_SHA1_ids_even_for_duplicate_elements():
elements: list[Element] = [
Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)),
Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)),
Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)),
]
# -- default ids are UUIDs --
assert all(len(e.id) == 36 for e in elements)
elements = _assign_hash_ids(copy.deepcopy(elements))
elements_2 = _assign_hash_ids(copy.deepcopy(elements))
ids = [e.id for e in elements]
# -- ids are now SHA1 --
assert all(len(e.id) == 32 for e in elements)
# -- each id is unique --
assert len(ids) == len(set(ids))
# -- ids are deterministic, same value is computed each time --
assert all(e.id == e2.id for e, e2 in zip(elements, elements_2))

View File

@ -1 +1 @@
__version__ = "0.15.14-dev5" # pragma: no cover __version__ = "0.15.14-dev6" # pragma: no cover

View File

@ -2,14 +2,16 @@
from __future__ import annotations from __future__ import annotations
import copy
import datetime as dt import datetime as dt
import functools import functools
import itertools
import os import os
from typing import Any, Callable, Sequence from typing import Any, Callable, Iterator, Sequence
from typing_extensions import ParamSpec from typing_extensions import ParamSpec
from unstructured.documents.elements import Element, ElementMetadata, assign_and_map_hash_ids from unstructured.documents.elements import Element, ElementMetadata
from unstructured.file_utils.model import FileType from unstructured.file_utils.model import FileType
from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.utils import get_call_args_applying_defaults from unstructured.utils import get_call_args_applying_defaults
@ -139,6 +141,8 @@ def apply_metadata(
- Replace `filename` with `metadata_filename` when present. - Replace `filename` with `metadata_filename` when present.
- Replace `last_modified` with `metadata_last_modified` when present.
- Apply `url` metadata when present. - Apply `url` metadata when present.
""" """
@ -155,14 +159,21 @@ def apply_metadata(
elements = func(*args, **kwargs) elements = func(*args, **kwargs)
call_args = get_call_args_applying_defaults(func, *args, **kwargs) call_args = get_call_args_applying_defaults(func, *args, **kwargs)
# -- Compute and apply hash-ids if the user does not want UUIDs. Note this changes the # ------------------------------------------------------------------------------------
# -- elements themselves, not the metadata. # unique-ify elements
unique_element_ids: bool = call_args.get("unique_element_ids", False) # ------------------------------------------------------------------------------------
if unique_element_ids is False: # Do this first to ensure all following operations behave as expected. It's easy for a
elements = assign_and_map_hash_ids(elements) # partitioner to re-use an element or metadata instance when its values are common to
# multiple elements. This can lead to very hard-to diagnose bugs downstream when
# mutating one element unexpectedly also mutates others (because they are the same
# instance).
# ------------------------------------------------------------------------------------
# -- `parent_id` - process category-level etc. to assign parent-id -- elements = _uniqueify_elements_and_metadata(elements)
elements = set_element_hierarchy(elements)
# ------------------------------------------------------------------------------------
# apply metadata - do this first because it affects the hash computation.
# ------------------------------------------------------------------------------------
# -- `language` - auto-detect language (e.g. eng, spa) -- # -- `language` - auto-detect language (e.g. eng, spa) --
languages = call_args.get("languages") languages = call_args.get("languages")
@ -175,7 +186,7 @@ def apply_metadata(
) )
) )
# == apply filetype, filename, and url metadata ========================= # == apply filetype, filename, last_modified, and url metadata ===================
metadata_kwargs: dict[str, Any] = {} metadata_kwargs: dict[str, Any] = {}
# -- `filetype` (MIME-type) metadata -- # -- `filetype` (MIME-type) metadata --
@ -188,6 +199,11 @@ def apply_metadata(
if filename: if filename:
metadata_kwargs["filename"] = filename metadata_kwargs["filename"] = filename
# -- `last_modified` metadata - override with metadata_last_modified when present --
metadata_last_modified = call_args.get("metadata_last_modified")
if metadata_last_modified:
metadata_kwargs["last_modified"] = metadata_last_modified
# -- `url` metadata - record url when present -- # -- `url` metadata - record url when present --
url = call_args.get("url") url = call_args.get("url")
if url: if url:
@ -201,8 +217,70 @@ def apply_metadata(
continue continue
element.metadata.update(ElementMetadata(**metadata_kwargs)) element.metadata.update(ElementMetadata(**metadata_kwargs))
# ------------------------------------------------------------------------------------
# compute hash ids (when so requestsd)
# ------------------------------------------------------------------------------------
# -- Compute and apply hash-ids if the user does not want UUIDs. Note this mutates the
# -- elements themselves, not their metadata.
unique_element_ids: bool = call_args.get("unique_element_ids", False)
if unique_element_ids is False:
elements = _assign_hash_ids(elements)
# ------------------------------------------------------------------------------------
# assign parent-id - do this after hash computation so parent-id is stable.
# ------------------------------------------------------------------------------------
# -- `parent_id` - process category-level etc. to assign parent-id --
elements = set_element_hierarchy(elements)
return elements return elements
return wrapper return wrapper
return decorator return decorator
def _assign_hash_ids(elements: list[Element]) -> list[Element]:
"""Converts `.id` of each element from UUID to hash.
The hash is based on the `.text` of the element, but also on its page-number and sequence number
on that page. This provides for deterministic results even when the document is split into one
or more fragments for parallel processing.
"""
# -- generate sequence number for each element on a page --
page_numbers = [e.metadata.page_number for e in elements]
page_seq_numbers = [
seq_on_page
for _, group in itertools.groupby(page_numbers)
for seq_on_page, _ in enumerate(group)
]
for element, seq_on_page_counter in zip(elements, page_seq_numbers):
element.id_to_hash(seq_on_page_counter)
return elements
def _uniqueify_elements_and_metadata(elements: list[Element]) -> list[Element]:
"""Ensure each of `elements` and their metadata are unique instances.
This prevents hard-to-diagnose bugs downstream when mutating one element unexpectedly also
mutates others because they are the same instance.
"""
def iter_unique_elements(elements: list[Element]) -> Iterator[Element]:
"""Substitute deep-copies of any non-unique elements or metadata in `elements`."""
seen_elements: set[int] = set()
seen_metadata: set[int] = set()
for element in elements:
if id(element) in seen_elements:
element = copy.deepcopy(element)
if id(element.metadata) in seen_metadata:
element.metadata = copy.deepcopy(element.metadata)
seen_elements.add(id(element))
seen_metadata.add(id(element.metadata))
yield element
return list(iter_unique_elements(elements))