From c05e1babf1d84f3e09f1ba2da034ec9ae80643a9 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 1 Oct 2024 14:28:32 -0700 Subject: [PATCH] rfctr(meta): refine @apply_metadata() decorator (#3667) **Summary** Refine `@apply_metadata()` replacement decorator. Note it has not been installed yet. - Apply `metadata_last_modified` arg with the `@apply_metadata()` decorator. No need for redundant processing in each partitioner. - Add "unique-ify" step to fix any cases where the same `Element` or `ElementMetadata` instance was used more than once in the element stream. This prevents unexpected "multi-mutation" in downstream processes. - Apply "global" metadata items before computing hash-ids. In particular, `.metadata.filename` is used in the hash computation and will produce different results if that's not already settled. - Compute hash-ids _before_ computing `.metadata.parent_id`. This removes the need for mapping UUID element-ids to their hash counterpart and doing a fixup of `.parent_id` after applying hash-ids to elements. **Additional Context** - The `@apply_metadata()` decorator replaces the four metadata-related decorators: `@process_metadata()`, `@add_metadata_with_filetype()`, `@add_metadata()`, and `@add_filetype()`. - It will be installed on each partitioner in a series of following PRs. --- CHANGELOG.md | 2 +- .../partition/common/test_metadata.py | 96 ++++++++++++++++++ unstructured/__version__.py | 2 +- unstructured/partition/common/metadata.py | 98 +++++++++++++++++-- 4 files changed, 186 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4be12648c..a72eaa56a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.14-dev5 +## 0.15.14-dev6 ### Enhancements diff --git a/test_unstructured/partition/common/test_metadata.py b/test_unstructured/partition/common/test_metadata.py index 345e55ae6..f4096bb62 100644 --- a/test_unstructured/partition/common/test_metadata.py +++ b/test_unstructured/partition/common/test_metadata.py @@ -1,7 +1,10 @@ """Test-suite for `unstructured.partition.common.metadata` module.""" +# pyright: reportPrivateUsage=false + from __future__ import annotations +import copy import datetime as dt import os import pathlib @@ -22,6 +25,7 @@ from unstructured.documents.elements import ( ) from unstructured.file_utils.model import FileType from unstructured.partition.common.metadata import ( + _assign_hash_ids, apply_metadata, get_last_modified_date, set_element_hierarchy, @@ -127,9 +131,50 @@ def test_set_element_hierarchy_custom_rule_set(): ), "FigureCaption should be child of Title 2" +# ================================================================================================ +# APPLY METADATA DECORATOR +# ================================================================================================ + + class Describe_apply_metadata: """Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator.""" + # -- unique-ify elements and metadata --------------------------------- + + def it_produces_unique_elements_and_metadata_when_input_reuses_element_instances(self): + element = Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)) + + def fake_partitioner(**kwargs: Any) -> list[Element]: + return [element, element, element] + + partition = apply_metadata()(fake_partitioner) + + elements = partition() + + # -- all elements are unique instances -- + assert len({id(e) for e in elements}) == len(elements) + # -- all metadatas are unique instances -- + assert len({id(e.metadata) for e in elements}) == len(elements) + + def and_it_produces_unique_elements_and_metadata_when_input_reuses_metadata_instances(self): + metadata = ElementMetadata(filename="foo.bar", page_number=1) + + def fake_partitioner(**kwargs: Any) -> list[Element]: + return [ + Text(text="foo", metadata=metadata), + Text(text="bar", metadata=metadata), + Text(text="baz", metadata=metadata), + ] + + partition = apply_metadata()(fake_partitioner) + + elements = partition() + + # -- all elements are unique instances -- + assert len({id(e) for e in elements}) == len(elements) + # -- all metadatas are unique instances -- + assert len({id(e.metadata) for e in elements}) == len(elements) + # -- unique-ids ------------------------------------------------------- def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified( @@ -274,6 +319,29 @@ class Describe_apply_metadata: assert all(e.metadata.filename == "image.jpeg" for e in elements) assert all(e.metadata.file_directory == "x/y/images" for e in elements) + # -- last_modified ---------------------------------------------------- + + def it_uses_metadata_last_modified_arg_value_when_present( + self, fake_partitioner: Callable[..., list[Element]] + ): + """A `metadata_last_modified` arg overrides all other sources.""" + partition = apply_metadata()(fake_partitioner) + metadata_last_modified = "2024-09-26T15:17:53" + + elements = partition(metadata_last_modified=metadata_last_modified) + + assert all(e.metadata.last_modified == metadata_last_modified for e in elements) + + @pytest.mark.parametrize("kwargs", [{}, {"metadata_last_modified": None}]) + def but_it_does_not_update_last_modified_when_metadata_last_modified_arg_absent_or_None( + self, kwargs: dict[str, Any], fake_partitioner: Callable[..., list[Element]] + ): + partition = apply_metadata()(fake_partitioner) + + elements = partition(**kwargs) + + assert all(e.metadata.last_modified == "2020-01-06T05:07:03" for e in elements) + # -- url -------------------------------------------------------------- def it_assigns_url_metadata_field_when_url_arg_is_present( @@ -304,14 +372,42 @@ class Describe_apply_metadata: title.metadata.file_directory = "x/y/images" title.metadata.filename = "image.jpeg" title.metadata.filetype = "image/jpeg" + title.metadata.last_modified = "2020-01-06T05:07:03" title.metadata.url = "http://images.com" narr_text = NarrativeText("To understand bar you must first understand foo.") narr_text.metadata.file_directory = "x/y/images" narr_text.metadata.filename = "image.jpeg" narr_text.metadata.filetype = "image/jpeg" + narr_text.metadata.last_modified = "2020-01-06T05:07:03" narr_text.metadata.url = "http://images.com" return [title, narr_text] return fake_partitioner + + +# ================================================================================================ +# HASH IDS +# ================================================================================================ + + +def test_assign_hash_ids_produces_unique_and_deterministic_SHA1_ids_even_for_duplicate_elements(): + elements: list[Element] = [ + Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)), + Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)), + Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)), + ] + # -- default ids are UUIDs -- + assert all(len(e.id) == 36 for e in elements) + + elements = _assign_hash_ids(copy.deepcopy(elements)) + elements_2 = _assign_hash_ids(copy.deepcopy(elements)) + + ids = [e.id for e in elements] + # -- ids are now SHA1 -- + assert all(len(e.id) == 32 for e in elements) + # -- each id is unique -- + assert len(ids) == len(set(ids)) + # -- ids are deterministic, same value is computed each time -- + assert all(e.id == e2.id for e, e2 in zip(elements, elements_2)) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5809122b1..3789b249a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.14-dev5" # pragma: no cover +__version__ = "0.15.14-dev6" # pragma: no cover diff --git a/unstructured/partition/common/metadata.py b/unstructured/partition/common/metadata.py index 7de1909a3..11315b47a 100644 --- a/unstructured/partition/common/metadata.py +++ b/unstructured/partition/common/metadata.py @@ -2,14 +2,16 @@ from __future__ import annotations +import copy import datetime as dt import functools +import itertools import os -from typing import Any, Callable, Sequence +from typing import Any, Callable, Iterator, Sequence from typing_extensions import ParamSpec -from unstructured.documents.elements import Element, ElementMetadata, assign_and_map_hash_ids +from unstructured.documents.elements import Element, ElementMetadata from unstructured.file_utils.model import FileType from unstructured.partition.common.lang import apply_lang_metadata from unstructured.utils import get_call_args_applying_defaults @@ -139,6 +141,8 @@ def apply_metadata( - Replace `filename` with `metadata_filename` when present. + - Replace `last_modified` with `metadata_last_modified` when present. + - Apply `url` metadata when present. """ @@ -155,14 +159,21 @@ def apply_metadata( elements = func(*args, **kwargs) call_args = get_call_args_applying_defaults(func, *args, **kwargs) - # -- Compute and apply hash-ids if the user does not want UUIDs. Note this changes the - # -- elements themselves, not the metadata. - unique_element_ids: bool = call_args.get("unique_element_ids", False) - if unique_element_ids is False: - elements = assign_and_map_hash_ids(elements) + # ------------------------------------------------------------------------------------ + # unique-ify elements + # ------------------------------------------------------------------------------------ + # Do this first to ensure all following operations behave as expected. It's easy for a + # partitioner to re-use an element or metadata instance when its values are common to + # multiple elements. This can lead to very hard-to diagnose bugs downstream when + # mutating one element unexpectedly also mutates others (because they are the same + # instance). + # ------------------------------------------------------------------------------------ - # -- `parent_id` - process category-level etc. to assign parent-id -- - elements = set_element_hierarchy(elements) + elements = _uniqueify_elements_and_metadata(elements) + + # ------------------------------------------------------------------------------------ + # apply metadata - do this first because it affects the hash computation. + # ------------------------------------------------------------------------------------ # -- `language` - auto-detect language (e.g. eng, spa) -- languages = call_args.get("languages") @@ -175,7 +186,7 @@ def apply_metadata( ) ) - # == apply filetype, filename, and url metadata ========================= + # == apply filetype, filename, last_modified, and url metadata =================== metadata_kwargs: dict[str, Any] = {} # -- `filetype` (MIME-type) metadata -- @@ -188,6 +199,11 @@ def apply_metadata( if filename: metadata_kwargs["filename"] = filename + # -- `last_modified` metadata - override with metadata_last_modified when present -- + metadata_last_modified = call_args.get("metadata_last_modified") + if metadata_last_modified: + metadata_kwargs["last_modified"] = metadata_last_modified + # -- `url` metadata - record url when present -- url = call_args.get("url") if url: @@ -201,8 +217,70 @@ def apply_metadata( continue element.metadata.update(ElementMetadata(**metadata_kwargs)) + # ------------------------------------------------------------------------------------ + # compute hash ids (when so requestsd) + # ------------------------------------------------------------------------------------ + + # -- Compute and apply hash-ids if the user does not want UUIDs. Note this mutates the + # -- elements themselves, not their metadata. + unique_element_ids: bool = call_args.get("unique_element_ids", False) + if unique_element_ids is False: + elements = _assign_hash_ids(elements) + + # ------------------------------------------------------------------------------------ + # assign parent-id - do this after hash computation so parent-id is stable. + # ------------------------------------------------------------------------------------ + + # -- `parent_id` - process category-level etc. to assign parent-id -- + elements = set_element_hierarchy(elements) + return elements return wrapper return decorator + + +def _assign_hash_ids(elements: list[Element]) -> list[Element]: + """Converts `.id` of each element from UUID to hash. + + The hash is based on the `.text` of the element, but also on its page-number and sequence number + on that page. This provides for deterministic results even when the document is split into one + or more fragments for parallel processing. + """ + # -- generate sequence number for each element on a page -- + page_numbers = [e.metadata.page_number for e in elements] + page_seq_numbers = [ + seq_on_page + for _, group in itertools.groupby(page_numbers) + for seq_on_page, _ in enumerate(group) + ] + + for element, seq_on_page_counter in zip(elements, page_seq_numbers): + element.id_to_hash(seq_on_page_counter) + + return elements + + +def _uniqueify_elements_and_metadata(elements: list[Element]) -> list[Element]: + """Ensure each of `elements` and their metadata are unique instances. + + This prevents hard-to-diagnose bugs downstream when mutating one element unexpectedly also + mutates others because they are the same instance. + """ + + def iter_unique_elements(elements: list[Element]) -> Iterator[Element]: + """Substitute deep-copies of any non-unique elements or metadata in `elements`.""" + seen_elements: set[int] = set() + seen_metadata: set[int] = set() + + for element in elements: + if id(element) in seen_elements: + element = copy.deepcopy(element) + if id(element.metadata) in seen_metadata: + element.metadata = copy.deepcopy(element.metadata) + seen_elements.add(id(element)) + seen_metadata.add(id(element.metadata)) + yield element + + return list(iter_unique_elements(elements))