feat: unique_element_ids kwarg for UUID elements (#1085)

* added kwarg for unique elements * test for unique ids * update docs * changelog and version
2025-12-24 05:34:58 +00:00 · 2023-08-11 07:02:37 -04:00 · 2023-08-11 07:02:37 -04:00 · fa5a3dbd81
commit fa5a3dbd81
parent d26ab1deac
6 changed files with 54 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,8 @@

 ### Enhancements

+* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
+  for element IDs instead of a SHA-256 hash.
 * Add functionality to switch `html` text parser based on whether the `html` text contains emoji
 * Add functionality to check if a string contains any emoji characters

@ -15,6 +17,7 @@

 * Update table extraction section in API documentation to sync with change in Prod API
 * Update Notion connector to extract to html
+* Added UUID option for `element_id`
 * Bump unstructured-inference==0.5.9:
  - better caching of models
  - another version of detectron2 available, though the default layout model is unchanged
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@ -210,6 +210,24 @@ a list of elements from JSON, as seen in the snippet below
    elements = elements_from_json(filename=filename)


+###################
+Unique Element IDs
+###################
+
+By default, the element ID is a SHA-256 hash of the element text. This is to ensure that
+the ID is deterministic. One downside is that the ID is not guaranteed to be unique.
+Different elements with the same text will have the same ID, and there could also
+be hash collisions. To use UUIDs in the output instead, you can pass
+``unique_element_ids=True`` into any of the partition functions. This can be helpful
+if you'd like to use the IDs as a primary key in a database, for example.
+
+.. code:: python
+
+    from unstructured.partition.text import partition_text
+
+    elements = partition_text(text="Here is some example text.", unique_element_ids=True)
+    elements[0].id
+

 ##################
 Wrapping it all up
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -21,7 +21,7 @@ Library Documentation
  Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.

 :doc:`connectors`
-  Connect to your favortite data storage platforms for an efortless batch processing of your files.
+  Connect to your favorite data storage platforms for effortless batch processing of your files.

 :doc:`metadata`
  Learn more about how metadata is tracked in the ``unstructured`` library.
--- a/test_unstructured/documents/test_elements.py
+++ b/test_unstructured/documents/test_elements.py
@ -1,4 +1,4 @@
-import uuid
+import json
 from functools import partial

 import pytest
@ -26,7 +26,10 @@ def test_text_id():

 def test_text_uuid():
    text_element = Text(text="hello there!", element_id=UUID())
-    assert isinstance(text_element.id, uuid.UUID)
+    assert len(text_element.id) == 36
+    assert text_element.id.count("-") == 4
+    # Test that the element is JSON serializable. This should run without an error
+    json.dumps(text_element.to_dict())


 def test_element_defaults_to_blank_id():
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@ -1,3 +1,4 @@
+import json
 import os
 import pathlib

@ -464,3 +465,16 @@ def test_partition_text_from_text_with_custom_metadata_date(
    elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date)

    assert elements[0].metadata.last_modified == expected_last_modification_date
+
+
+def test_partition_text_with_unique_ids():
+    elements = partition_text(text="hello there!")
+    assert elements[0].id == "c69509590d81db2f37f9d75480c8efed"
+    # Test that the element is JSON serializable. This should run without an error
+    json.dumps(elements[0].to_dict())
+
+    elements = partition_text(text="hello there!", unique_element_ids=True)
+    assert len(elements[0].id) == 36
+    assert elements[0].id.count("-") == 4
+    # Test that the element is JSON serializable. This should run without an error
+    json.dumps(elements[0].to_dict())
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -239,6 +239,11 @@ def process_metadata():
            regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {})
            elements = _add_regex_metadata(elements, regex_metadata)

+            unique_element_ids: bool = params.get("unique_element_ids", False)
+            if unique_element_ids:
+                for element in elements:
+                    element.id_to_uuid()
+
            return elements

        return wrapper
@ -246,6 +251,10 @@ def process_metadata():
    return decorator


+def _elements_ids_to_uuid():
+    pass
+
+
 def _add_regex_metadata(
    elements: List[Element],
    regex_metadata: Dict[str, str] = {},
@ -300,6 +309,9 @@ class Element(ABC):
        )
        self.metadata = metadata.merge(ElementMetadata(coordinates=coordinates_metadata))

+    def id_to_uuid(self):
+        self.id = str(uuid.uuid4())
+
    def to_dict(self) -> dict:
        return {
            "type": None,
@ -385,7 +397,7 @@ class Text(Element):
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

        elif isinstance(element_id, UUID):
-            element_id = uuid.uuid4()
+            element_id = str(uuid.uuid4())

        super().__init__(
            element_id=element_id,