feat: unique_element_ids kwarg for UUID elements (#1085)

* added kwarg for unique elements * test for unique ids * update docs * changelog and version
2025-11-09 23:17:21 +00:00 · 2023-08-11 07:02:37 -04:00 · 2023-08-11 07:02:37 -04:00 · fa5a3dbd81
commit fa5a3dbd81
parent d26ab1deac
6 changed files with 54 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,8 @@
 ### Enhancements
 * Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
  for element IDs instead of a SHA-256 hash.
 * Add functionality to switch `html` text parser based on whether the `html` text contains emoji
 * Add functionality to check if a string contains any emoji characters
@ -15,6 +17,7 @@
 * Update table extraction section in API documentation to sync with change in Prod API
 * Update Notion connector to extract to html
 * Added UUID option for `element_id`
 * Bump unstructured-inference==0.5.9:
  - better caching of models
  - another version of detectron2 available, though the default layout model is unchanged
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@ -210,6 +210,24 @@ a list of elements from JSON, as seen in the snippet below
    elements = elements_from_json(filename=filename)
 ###################
 Unique Element IDs
 ###################
 By default, the element ID is a SHA-256 hash of the element text. This is to ensure that
 the ID is deterministic. One downside is that the ID is not guaranteed to be unique.
 Different elements with the same text will have the same ID, and there could also
 be hash collisions. To use UUIDs in the output instead, you can pass
 ``unique_element_ids=True`` into any of the partition functions. This can be helpful
 if you'd like to use the IDs as a primary key in a database, for example.
 .. code:: python
    from unstructured.partition.text import partition_text
    elements = partition_text(text="Here is some example text.", unique_element_ids=True)
    elements[0].id
 ##################
 Wrapping it all up
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -21,7 +21,7 @@ Library Documentation
  Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
 :doc:`connectors`
-  Connect to your favortite data storage platforms for an efortless batch processing of your files.
+  Connect to your favorite data storage platforms for effortless batch processing of your files.
 :doc:`metadata`
  Learn more about how metadata is tracked in the ``unstructured`` library.
--- a/test_unstructured/documents/test_elements.py
+++ b/test_unstructured/documents/test_elements.py
@ -1,4 +1,4 @@
-import uuid
+import json
 from functools import partial
 import pytest
@ -26,7 +26,10 @@ def test_text_id():
 def test_text_uuid():
    text_element = Text(text="hello there!", element_id=UUID())
-    assert isinstance(text_element.id, uuid.UUID)
+    assert len(text_element.id) == 36
    assert text_element.id.count("-") == 4
    # Test that the element is JSON serializable. This should run without an error
    json.dumps(text_element.to_dict())
 def test_element_defaults_to_blank_id():
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@ -1,3 +1,4 @@
 import json
 import os
 import pathlib
@ -464,3 +465,16 @@ def test_partition_text_from_text_with_custom_metadata_date(
    elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date)
    assert elements[0].metadata.last_modified == expected_last_modification_date
 def test_partition_text_with_unique_ids():
    """Check element IDs from ``partition_text`` with and without ``unique_element_ids``.

    The default call is pinned to a fixed expected ID; the call with
    ``unique_element_ids=True`` must yield a 36-character ID containing four
    hyphens. In both cases the element's dict form must be JSON serializable.
    """
    sample = "hello there!"

    # Default behaviour: the ID for this text is pinned to a known value.
    default_first = partition_text(text=sample)[0]
    assert default_first.id == "c69509590d81db2f37f9d75480c8efed"
    # Serializing must not raise.
    json.dumps(default_first.to_dict())

    # Opt-in behaviour: the ID has the length and hyphen count of a UUID string.
    unique_first = partition_text(text=sample, unique_element_ids=True)[0]
    unique_id = unique_first.id
    assert len(unique_id) == 36
    assert unique_id.count("-") == 4
    # Serializing must not raise.
    json.dumps(unique_first.to_dict())
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -239,6 +239,11 @@ def process_metadata():
            regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {})
            elements = _add_regex_metadata(elements, regex_metadata)
            unique_element_ids: bool = params.get("unique_element_ids", False)
            if unique_element_ids:
                for element in elements:
                    element.id_to_uuid()
            return elements
        return wrapper
@ -246,6 +251,10 @@ def process_metadata():
    return decorator
 def _elements_ids_to_uuid():
    """Placeholder with no behavior: takes no arguments and returns ``None``."""
    return None
 def _add_regex_metadata(
    elements: List[Element],
    regex_metadata: Dict[str, str] = {},
@ -300,6 +309,9 @@ class Element(ABC):
        )
        self.metadata = metadata.merge(ElementMetadata(coordinates=coordinates_metadata))
    def id_to_uuid(self):
        """Overwrite ``self.id`` with the string form of a new ``uuid.uuid4()`` value."""
        fresh_uuid = uuid.uuid4()
        # Stored as ``str`` rather than as a ``uuid.UUID`` object.
        self.id = str(fresh_uuid)
    def to_dict(self) -> dict:
        return {
            "type": None,
@ -385,7 +397,7 @@ class Text(Element):
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
        elif isinstance(element_id, UUID):
-            element_id = uuid.uuid4()
+            element_id = str(uuid.uuid4())
        super().__init__(
            element_id=element_id,