feat: unique_element_ids kwarg for UUID elements (#1085)

* added kwarg for unique elements

* test for unique ids

* update docs

* changelog and version
This commit is contained in:
Matt Robinson 2023-08-11 07:02:37 -04:00 committed by GitHub
parent d26ab1deac
commit fa5a3dbd81
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 54 additions and 4 deletions

View File

@ -2,6 +2,8 @@
### Enhancements
* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
for element IDs instead of a SHA-256 hash.
* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
* Add functionality to check if a string contains any emoji characters
@ -15,6 +17,7 @@
* Update table extraction section in API documentation to sync with change in Prod API
* Update Notion connector to extract to html
* Added UUID option for `element_id`
* Bump unstructured-inference==0.5.9:
- better caching of models
- another version of detectron2 available, though the default layout model is unchanged

View File

@ -210,6 +210,24 @@ a list of elements from JSON, as seen in the snippet below
elements = elements_from_json(filename=filename)
###################
Unique Element IDs
###################
By default, the element ID is a SHA-256 hash of the element text. This is to ensure that
the ID is deterministic. One downside is that the ID is not guaranteed to be unique.
Different elements with the same text will have the same ID, and there could also
be hash collisions. To use UUIDs in the output instead, you can pass
``unique_element_ids=True`` into any of the partition functions. This can be helpful
if you'd like to use the IDs as a primary key in a database, for example.
.. code:: python
from unstructured.partition.text import partition_text
elements = partition_text(text="Here is some example text.", unique_element_ids=True)
elements[0].id
##################
Wrapping it all up

View File

@ -21,7 +21,7 @@ Library Documentation
Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
:doc:`connectors`
Connect to your favortite data storage platforms for an efortless batch processing of your files.
Connect to your favorite data storage platforms for effortless batch processing of your files.
:doc:`metadata`
Learn more about how metadata is tracked in the ``unstructured`` library.

View File

@ -1,4 +1,4 @@
import uuid
import json
from functools import partial
import pytest
@ -26,7 +26,10 @@ def test_text_id():
def test_text_uuid():
text_element = Text(text="hello there!", element_id=UUID())
assert isinstance(text_element.id, uuid.UUID)
assert len(text_element.id) == 36
assert text_element.id.count("-") == 4
# Test that the element is JSON serializable. This should run without an error
json.dumps(text_element.to_dict())
def test_element_defaults_to_blank_id():

View File

@ -1,3 +1,4 @@
import json
import os
import pathlib
@ -464,3 +465,16 @@ def test_partition_text_from_text_with_custom_metadata_date(
elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date)
assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_text_with_unique_ids():
    """Check the element ID with and without ``unique_element_ids=True``.

    Without the kwarg the ID must equal the expected fixed hash string; with
    it the ID must have the 36-character, four-hyphen UUID string layout.
    In both modes the element must serialize to JSON without raising.
    """
    sample_text = "hello there!"

    # Default behaviour: the ID matches a fixed, known value for this text.
    hashed_element = partition_text(text=sample_text)[0]
    assert hashed_element.id == "c69509590d81db2f37f9d75480c8efed"
    # Test that the element is JSON serializable. This should run without an error
    json.dumps(hashed_element.to_dict())

    # Opt-in behaviour: the ID has the canonical UUID string layout.
    unique_element = partition_text(text=sample_text, unique_element_ids=True)[0]
    assert len(unique_element.id) == 36
    assert unique_element.id.count("-") == 4
    # Test that the element is JSON serializable. This should run without an error
    json.dumps(unique_element.to_dict())

View File

@ -239,6 +239,11 @@ def process_metadata():
regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {})
elements = _add_regex_metadata(elements, regex_metadata)
unique_element_ids: bool = params.get("unique_element_ids", False)
if unique_element_ids:
for element in elements:
element.id_to_uuid()
return elements
return wrapper
@ -246,6 +251,10 @@ def process_metadata():
return decorator
def _elements_ids_to_uuid() -> None:
    """Placeholder that currently does nothing.

    NOTE(review): no caller is visible in this hunk; the metadata wrapper
    shown nearby calls ``element.id_to_uuid()`` on each element directly.
    Confirm whether this stub is still needed.
    """
    pass
def _add_regex_metadata(
elements: List[Element],
regex_metadata: Dict[str, str] = {},
@ -300,6 +309,9 @@ class Element(ABC):
)
self.metadata = metadata.merge(ElementMetadata(coordinates=coordinates_metadata))
def id_to_uuid(self):
    """Overwrite ``self.id`` with the string form of a newly generated UUID4."""
    fresh_uuid = uuid.uuid4()
    # Store the string form rather than the uuid.UUID object.
    self.id = str(fresh_uuid)
def to_dict(self) -> dict:
return {
"type": None,
@ -385,7 +397,7 @@ class Text(Element):
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
elif isinstance(element_id, UUID):
element_id = uuid.uuid4()
element_id = str(uuid.uuid4())
super().__init__(
element_id=element_id,