diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e22f4945..da3934e4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ### Enhancements +* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID + for element IDs instead of a SHA-256 hash. * Add functionality to switch `html` text parser based on whether the `html` text contains emoji * Add functionality to check if a string contains any emoji characters @@ -15,6 +17,7 @@ * Update table extraction section in API documentation to sync with change in Prod API * Update Notion connector to extract to html +* Added UUID option for `element_id` * Bump unstructured-inference==0.5.9: - better caching of models - another version of detectron2 available, though the default layout model is unchanged diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index ee2353397..9f4a7707d 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -210,6 +210,24 @@ a list of elements from JSON, as seen in the snippet below elements = elements_from_json(filename=filename) +################### +Unique Element IDs +################### + +By default, the element ID is a SHA-256 hash of the element text. This is to ensure that +the ID is deterministic. One downside is that the ID is not guaranteed to be unique. +Different elements with the same text will have the same ID, and there could also +be hash collisions. To use UUIDs in the output instead, you can pass +``unique_element_ids=True`` into any of the partition functions. This can be helpful +if you'd like to use the IDs as a primary key in a database, for example. + +.. 
code:: python + + from unstructured.partition.text import partition_text + + elements = partition_text(text="Here is some example text.", unique_element_ids=True) + elements[0].id + ################## Wrapping it all up diff --git a/docs/source/index.rst b/docs/source/index.rst index 110841a42..a53fd517a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,7 +21,7 @@ Library Documentation Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns. :doc:`connectors` - Connect to your favortite data storage platforms for an efortless batch processing of your files. + Connect to your favorite data storage platforms for effortless batch processing of your files. :doc:`metadata` Learn more about how metadata is tracked in the ``unstructured`` library. diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 800d929fa..270e16fb6 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -1,4 +1,4 @@ -import uuid +import json from functools import partial import pytest @@ -26,7 +26,10 @@ def test_text_id(): def test_text_uuid(): text_element = Text(text="hello there!", element_id=UUID()) - assert isinstance(text_element.id, uuid.UUID) + assert len(text_element.id) == 36 + assert text_element.id.count("-") == 4 + # Test that the element is JSON serializable. 
This should run without an error + json.dumps(text_element.to_dict()) def test_element_defaults_to_blank_id(): diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py index 74d03d0b4..a5c95545d 100644 --- a/test_unstructured/partition/test_text.py +++ b/test_unstructured/partition/test_text.py @@ -1,3 +1,4 @@ +import json import os import pathlib @@ -464,3 +465,16 @@ def test_partition_text_from_text_with_custom_metadata_date( elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date) assert elements[0].metadata.last_modified == expected_last_modification_date + + +def test_partition_text_with_unique_ids(): + elements = partition_text(text="hello there!") + assert elements[0].id == "c69509590d81db2f37f9d75480c8efed" + # Test that the element is JSON serializable. This should run without an error + json.dumps(elements[0].to_dict()) + + elements = partition_text(text="hello there!", unique_element_ids=True) + assert len(elements[0].id) == 36 + assert elements[0].id.count("-") == 4 + # Test that the element is JSON serializable. 
This should run without an error + json.dumps(elements[0].to_dict()) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 9c4052ff2..a421b1063 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -239,6 +239,11 @@ def process_metadata(): regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {}) elements = _add_regex_metadata(elements, regex_metadata) + unique_element_ids: bool = params.get("unique_element_ids", False) + if unique_element_ids: + for element in elements: + element.id_to_uuid() + return elements return wrapper @@ -246,6 +251,10 @@ def process_metadata(): return decorator +def _elements_ids_to_uuid(): + pass + + def _add_regex_metadata( elements: List[Element], regex_metadata: Dict[str, str] = {}, @@ -300,6 +309,9 @@ class Element(ABC): ) self.metadata = metadata.merge(ElementMetadata(coordinates=coordinates_metadata)) + def id_to_uuid(self): + self.id = str(uuid.uuid4()) + def to_dict(self) -> dict: return { "type": None, @@ -385,7 +397,7 @@ class Text(Element): element_id = hashlib.sha256(text.encode()).hexdigest()[:32] elif isinstance(element_id, UUID): - element_id = uuid.uuid4() + element_id = str(uuid.uuid4()) super().__init__( element_id=element_id,