mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	rfctr(meta): refine @apply_metadata() decorator (#3667)
**Summary** Refine `@apply_metadata()` replacement decorator. Note it has not been installed yet. - Apply `metadata_last_modified` arg with the `@apply_metadata()` decorator. No need for redundant processing in each partitioner. - Add "unique-ify" step to fix any cases where the same `Element` or `ElementMetadata` instance was used more than once in the element stream. This prevents unexpected "multi-mutation" in downstream processes. - Apply "global" metadata items before computing hash-ids. In particular, `.metadata.filename` is used in the hash computation and will produce different results if that's not already settled. - Compute hash-ids _before_ computing `.metadata.parent_id`. This removes the need for mapping UUID element-ids to their hash counterpart and doing a fixup of `.parent_id` after applying hash-ids to elements. **Additional Context** - The `@apply_metadata()` decorator replaces the four metadata-related decorators: `@process_metadata()`, `@add_metadata_with_filetype()`, `@add_metadata()`, and `@add_filetype()`. - It will be installed on each partitioner in a series of following PRs.
This commit is contained in:
		
							parent
							
								
									75c4998bc7
								
							
						
					
					
						commit
						c05e1babf1
					
				@ -1,4 +1,4 @@
 | 
			
		||||
## 0.15.14-dev5
 | 
			
		||||
## 0.15.14-dev6
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,7 +1,10 @@
 | 
			
		||||
"""Test-suite for `unstructured.partition.common.metadata` module."""
 | 
			
		||||
 | 
			
		||||
# pyright: reportPrivateUsage=false
 | 
			
		||||
 | 
			
		||||
from __future__ import annotations
 | 
			
		||||
 | 
			
		||||
import copy
 | 
			
		||||
import datetime as dt
 | 
			
		||||
import os
 | 
			
		||||
import pathlib
 | 
			
		||||
@ -22,6 +25,7 @@ from unstructured.documents.elements import (
 | 
			
		||||
)
 | 
			
		||||
from unstructured.file_utils.model import FileType
 | 
			
		||||
from unstructured.partition.common.metadata import (
 | 
			
		||||
    _assign_hash_ids,
 | 
			
		||||
    apply_metadata,
 | 
			
		||||
    get_last_modified_date,
 | 
			
		||||
    set_element_hierarchy,
 | 
			
		||||
@ -127,9 +131,50 @@ def test_set_element_hierarchy_custom_rule_set():
 | 
			
		||||
    ), "FigureCaption should be child of Title 2"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ================================================================================================
 | 
			
		||||
# APPLY METADATA DECORATOR
 | 
			
		||||
# ================================================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Describe_apply_metadata:
 | 
			
		||||
    """Unit-test suite for `unstructured.partition.common.metadata.apply_metadata()` decorator."""
 | 
			
		||||
 | 
			
		||||
    # -- unique-ify elements and metadata ---------------------------------
 | 
			
		||||
 | 
			
		||||
    def it_produces_unique_elements_and_metadata_when_input_reuses_element_instances(self):
 | 
			
		||||
        element = Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1))
 | 
			
		||||
 | 
			
		||||
        def fake_partitioner(**kwargs: Any) -> list[Element]:
 | 
			
		||||
            return [element, element, element]
 | 
			
		||||
 | 
			
		||||
        partition = apply_metadata()(fake_partitioner)
 | 
			
		||||
 | 
			
		||||
        elements = partition()
 | 
			
		||||
 | 
			
		||||
        # -- all elements are unique instances --
 | 
			
		||||
        assert len({id(e) for e in elements}) == len(elements)
 | 
			
		||||
        # -- all metadatas are unique instances --
 | 
			
		||||
        assert len({id(e.metadata) for e in elements}) == len(elements)
 | 
			
		||||
 | 
			
		||||
    def and_it_produces_unique_elements_and_metadata_when_input_reuses_metadata_instances(self):
 | 
			
		||||
        metadata = ElementMetadata(filename="foo.bar", page_number=1)
 | 
			
		||||
 | 
			
		||||
        def fake_partitioner(**kwargs: Any) -> list[Element]:
 | 
			
		||||
            return [
 | 
			
		||||
                Text(text="foo", metadata=metadata),
 | 
			
		||||
                Text(text="bar", metadata=metadata),
 | 
			
		||||
                Text(text="baz", metadata=metadata),
 | 
			
		||||
            ]
 | 
			
		||||
 | 
			
		||||
        partition = apply_metadata()(fake_partitioner)
 | 
			
		||||
 | 
			
		||||
        elements = partition()
 | 
			
		||||
 | 
			
		||||
        # -- all elements are unique instances --
 | 
			
		||||
        assert len({id(e) for e in elements}) == len(elements)
 | 
			
		||||
        # -- all metadatas are unique instances --
 | 
			
		||||
        assert len({id(e.metadata) for e in elements}) == len(elements)
 | 
			
		||||
 | 
			
		||||
    # -- unique-ids -------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
    def it_assigns_hash_element_ids_when_unique_ids_arg_is_not_specified(
 | 
			
		||||
@ -274,6 +319,29 @@ class Describe_apply_metadata:
 | 
			
		||||
        assert all(e.metadata.filename == "image.jpeg" for e in elements)
 | 
			
		||||
        assert all(e.metadata.file_directory == "x/y/images" for e in elements)
 | 
			
		||||
 | 
			
		||||
    # -- last_modified ----------------------------------------------------
 | 
			
		||||
 | 
			
		||||
    def it_uses_metadata_last_modified_arg_value_when_present(
 | 
			
		||||
        self, fake_partitioner: Callable[..., list[Element]]
 | 
			
		||||
    ):
 | 
			
		||||
        """A `metadata_last_modified` arg overrides all other sources."""
 | 
			
		||||
        partition = apply_metadata()(fake_partitioner)
 | 
			
		||||
        metadata_last_modified = "2024-09-26T15:17:53"
 | 
			
		||||
 | 
			
		||||
        elements = partition(metadata_last_modified=metadata_last_modified)
 | 
			
		||||
 | 
			
		||||
        assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
 | 
			
		||||
 | 
			
		||||
    @pytest.mark.parametrize("kwargs", [{}, {"metadata_last_modified": None}])
 | 
			
		||||
    def but_it_does_not_update_last_modified_when_metadata_last_modified_arg_absent_or_None(
 | 
			
		||||
        self, kwargs: dict[str, Any], fake_partitioner: Callable[..., list[Element]]
 | 
			
		||||
    ):
 | 
			
		||||
        partition = apply_metadata()(fake_partitioner)
 | 
			
		||||
 | 
			
		||||
        elements = partition(**kwargs)
 | 
			
		||||
 | 
			
		||||
        assert all(e.metadata.last_modified == "2020-01-06T05:07:03" for e in elements)
 | 
			
		||||
 | 
			
		||||
    # -- url --------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
    def it_assigns_url_metadata_field_when_url_arg_is_present(
 | 
			
		||||
@ -304,14 +372,42 @@ class Describe_apply_metadata:
 | 
			
		||||
            title.metadata.file_directory = "x/y/images"
 | 
			
		||||
            title.metadata.filename = "image.jpeg"
 | 
			
		||||
            title.metadata.filetype = "image/jpeg"
 | 
			
		||||
            title.metadata.last_modified = "2020-01-06T05:07:03"
 | 
			
		||||
            title.metadata.url = "http://images.com"
 | 
			
		||||
 | 
			
		||||
            narr_text = NarrativeText("To understand bar you must first understand foo.")
 | 
			
		||||
            narr_text.metadata.file_directory = "x/y/images"
 | 
			
		||||
            narr_text.metadata.filename = "image.jpeg"
 | 
			
		||||
            narr_text.metadata.filetype = "image/jpeg"
 | 
			
		||||
            narr_text.metadata.last_modified = "2020-01-06T05:07:03"
 | 
			
		||||
            narr_text.metadata.url = "http://images.com"
 | 
			
		||||
 | 
			
		||||
            return [title, narr_text]
 | 
			
		||||
 | 
			
		||||
        return fake_partitioner
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ================================================================================================
 | 
			
		||||
# HASH IDS
 | 
			
		||||
# ================================================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_assign_hash_ids_produces_unique_and_deterministic_SHA1_ids_even_for_duplicate_elements():
 | 
			
		||||
    elements: list[Element] = [
 | 
			
		||||
        Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)),
 | 
			
		||||
        Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)),
 | 
			
		||||
        Text(text="Element", metadata=ElementMetadata(filename="foo.bar", page_number=1)),
 | 
			
		||||
    ]
 | 
			
		||||
    # -- default ids are UUIDs --
 | 
			
		||||
    assert all(len(e.id) == 36 for e in elements)
 | 
			
		||||
 | 
			
		||||
    elements = _assign_hash_ids(copy.deepcopy(elements))
 | 
			
		||||
    elements_2 = _assign_hash_ids(copy.deepcopy(elements))
 | 
			
		||||
 | 
			
		||||
    ids = [e.id for e in elements]
 | 
			
		||||
    # -- ids are now SHA1 --
 | 
			
		||||
    assert all(len(e.id) == 32 for e in elements)
 | 
			
		||||
    # -- each id is unique --
 | 
			
		||||
    assert len(ids) == len(set(ids))
 | 
			
		||||
    # -- ids are deterministic, same value is computed each time --
 | 
			
		||||
    assert all(e.id == e2.id for e, e2 in zip(elements, elements_2))
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.15.14-dev5"  # pragma: no cover
 | 
			
		||||
__version__ = "0.15.14-dev6"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -2,14 +2,16 @@
 | 
			
		||||
 | 
			
		||||
from __future__ import annotations
 | 
			
		||||
 | 
			
		||||
import copy
 | 
			
		||||
import datetime as dt
 | 
			
		||||
import functools
 | 
			
		||||
import itertools
 | 
			
		||||
import os
 | 
			
		||||
from typing import Any, Callable, Sequence
 | 
			
		||||
from typing import Any, Callable, Iterator, Sequence
 | 
			
		||||
 | 
			
		||||
from typing_extensions import ParamSpec
 | 
			
		||||
 | 
			
		||||
from unstructured.documents.elements import Element, ElementMetadata, assign_and_map_hash_ids
 | 
			
		||||
from unstructured.documents.elements import Element, ElementMetadata
 | 
			
		||||
from unstructured.file_utils.model import FileType
 | 
			
		||||
from unstructured.partition.common.lang import apply_lang_metadata
 | 
			
		||||
from unstructured.utils import get_call_args_applying_defaults
 | 
			
		||||
@ -139,6 +141,8 @@ def apply_metadata(
 | 
			
		||||
 | 
			
		||||
      - Replace `filename` with `metadata_filename` when present.
 | 
			
		||||
 | 
			
		||||
      - Replace `last_modified` with `metadata_last_modified` when present.
 | 
			
		||||
 | 
			
		||||
      - Apply `url` metadata when present.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
@ -155,14 +159,21 @@ def apply_metadata(
 | 
			
		||||
            elements = func(*args, **kwargs)
 | 
			
		||||
            call_args = get_call_args_applying_defaults(func, *args, **kwargs)
 | 
			
		||||
 | 
			
		||||
            # -- Compute and apply hash-ids if the user does not want UUIDs. Note this changes the
 | 
			
		||||
            # -- elements themselves, not the metadata.
 | 
			
		||||
            unique_element_ids: bool = call_args.get("unique_element_ids", False)
 | 
			
		||||
            if unique_element_ids is False:
 | 
			
		||||
                elements = assign_and_map_hash_ids(elements)
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
            # unique-ify elements
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
            # Do this first to ensure all following operations behave as expected. It's easy for a
 | 
			
		||||
            # partitioner to re-use an element or metadata instance when its values are common to
 | 
			
		||||
            # multiple elements. This can lead to very hard-to diagnose bugs downstream when
 | 
			
		||||
            # mutating one element unexpectedly also mutates others (because they are the same
 | 
			
		||||
            # instance).
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
            # -- `parent_id` - process category-level etc. to assign parent-id --
 | 
			
		||||
            elements = set_element_hierarchy(elements)
 | 
			
		||||
            elements = _uniqueify_elements_and_metadata(elements)
 | 
			
		||||
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
            # apply metadata - do this first because it affects the hash computation.
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
            # -- `language` - auto-detect language (e.g. eng, spa) --
 | 
			
		||||
            languages = call_args.get("languages")
 | 
			
		||||
@ -175,7 +186,7 @@ def apply_metadata(
 | 
			
		||||
                )
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            # == apply filetype, filename, and url metadata =========================
 | 
			
		||||
            # == apply filetype, filename, last_modified, and url metadata ===================
 | 
			
		||||
            metadata_kwargs: dict[str, Any] = {}
 | 
			
		||||
 | 
			
		||||
            # -- `filetype` (MIME-type) metadata --
 | 
			
		||||
@ -188,6 +199,11 @@ def apply_metadata(
 | 
			
		||||
            if filename:
 | 
			
		||||
                metadata_kwargs["filename"] = filename
 | 
			
		||||
 | 
			
		||||
            # -- `last_modified` metadata - override with metadata_last_modified when present --
 | 
			
		||||
            metadata_last_modified = call_args.get("metadata_last_modified")
 | 
			
		||||
            if metadata_last_modified:
 | 
			
		||||
                metadata_kwargs["last_modified"] = metadata_last_modified
 | 
			
		||||
 | 
			
		||||
            # -- `url` metadata - record url when present --
 | 
			
		||||
            url = call_args.get("url")
 | 
			
		||||
            if url:
 | 
			
		||||
@ -201,8 +217,70 @@ def apply_metadata(
 | 
			
		||||
                    continue
 | 
			
		||||
                element.metadata.update(ElementMetadata(**metadata_kwargs))
 | 
			
		||||
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
            # compute hash ids (when so requestsd)
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
            # -- Compute and apply hash-ids if the user does not want UUIDs. Note this mutates the
 | 
			
		||||
            # -- elements themselves, not their metadata.
 | 
			
		||||
            unique_element_ids: bool = call_args.get("unique_element_ids", False)
 | 
			
		||||
            if unique_element_ids is False:
 | 
			
		||||
                elements = _assign_hash_ids(elements)
 | 
			
		||||
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
            # assign parent-id - do this after hash computation so parent-id is stable.
 | 
			
		||||
            # ------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
            # -- `parent_id` - process category-level etc. to assign parent-id --
 | 
			
		||||
            elements = set_element_hierarchy(elements)
 | 
			
		||||
 | 
			
		||||
            return elements
 | 
			
		||||
 | 
			
		||||
        return wrapper
 | 
			
		||||
 | 
			
		||||
    return decorator
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _assign_hash_ids(elements: list[Element]) -> list[Element]:
 | 
			
		||||
    """Converts `.id` of each element from UUID to hash.
 | 
			
		||||
 | 
			
		||||
    The hash is based on the `.text` of the element, but also on its page-number and sequence number
 | 
			
		||||
    on that page. This provides for deterministic results even when the document is split into one
 | 
			
		||||
    or more fragments for parallel processing.
 | 
			
		||||
    """
 | 
			
		||||
    # -- generate sequence number for each element on a page --
 | 
			
		||||
    page_numbers = [e.metadata.page_number for e in elements]
 | 
			
		||||
    page_seq_numbers = [
 | 
			
		||||
        seq_on_page
 | 
			
		||||
        for _, group in itertools.groupby(page_numbers)
 | 
			
		||||
        for seq_on_page, _ in enumerate(group)
 | 
			
		||||
    ]
 | 
			
		||||
 | 
			
		||||
    for element, seq_on_page_counter in zip(elements, page_seq_numbers):
 | 
			
		||||
        element.id_to_hash(seq_on_page_counter)
 | 
			
		||||
 | 
			
		||||
    return elements
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _uniqueify_elements_and_metadata(elements: list[Element]) -> list[Element]:
 | 
			
		||||
    """Ensure each of `elements` and their metadata are unique instances.
 | 
			
		||||
 | 
			
		||||
    This prevents hard-to-diagnose bugs downstream when mutating one element unexpectedly also
 | 
			
		||||
    mutates others because they are the same instance.
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def iter_unique_elements(elements: list[Element]) -> Iterator[Element]:
 | 
			
		||||
        """Substitute deep-copies of any non-unique elements or metadata in `elements`."""
 | 
			
		||||
        seen_elements: set[int] = set()
 | 
			
		||||
        seen_metadata: set[int] = set()
 | 
			
		||||
 | 
			
		||||
        for element in elements:
 | 
			
		||||
            if id(element) in seen_elements:
 | 
			
		||||
                element = copy.deepcopy(element)
 | 
			
		||||
            if id(element.metadata) in seen_metadata:
 | 
			
		||||
                element.metadata = copy.deepcopy(element.metadata)
 | 
			
		||||
            seen_elements.add(id(element))
 | 
			
		||||
            seen_metadata.add(id(element.metadata))
 | 
			
		||||
            yield element
 | 
			
		||||
 | 
			
		||||
    return list(iter_unique_elements(elements))
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user