mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 19:13:13 +00:00
feat: unique_element_ids kwarg for UUID elements (#1085)
* added kwarg for unique elements * test for unique ids * update docs * changelog and version
This commit is contained in:
parent
d26ab1deac
commit
fa5a3dbd81
@ -2,6 +2,8 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
|
||||
for element IDs instead of a SHA-256 hash.
|
||||
* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
|
||||
* Add functionality to check if a string contains any emoji characters
|
||||
|
||||
@ -15,6 +17,7 @@
|
||||
|
||||
* Update table extraction section in API documentation to sync with change in Prod API
|
||||
* Update Notion connector to extract to html
|
||||
* Added UUID option for `element_id`
|
||||
* Bump unstructured-inference==0.5.9:
|
||||
- better caching of models
|
||||
- another version of detectron2 available, though the default layout model is unchanged
|
||||
|
||||
@ -210,6 +210,24 @@ a list of elements from JSON, as seen in the snippet below
|
||||
elements = elements_from_json(filename=filename)
|
||||
|
||||
|
||||
###################
|
||||
Unique Element IDs
|
||||
###################
|
||||
|
||||
By default, the element ID is a SHA-256 hash of the element text. This is to ensure that
|
||||
the ID is deterministic. One downside is that the ID is not guaranteed to be unique.
|
||||
Different elements with the same text will have the same ID, and there could also
|
||||
be hash collisions. To use UUIDs in the output instead, you can pass
|
||||
``unique_element_ids=True`` into any of the partition functions. This can be helpful
|
||||
if you'd like to use the IDs as a primary key in a database, for example.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
elements = partition_text(text="Here is some example text.", unique_element_ids=True)
|
||||
elements[0].id
|
||||
|
||||
|
||||
##################
|
||||
Wrapping it all up
|
||||
|
||||
@ -21,7 +21,7 @@ Library Documentation
|
||||
Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
|
||||
|
||||
:doc:`connectors`
|
||||
Connect to your favortite data storage platforms for an efortless batch processing of your files.
|
||||
Connect to your favorite data storage platforms for effortless batch processing of your files.
|
||||
|
||||
:doc:`metadata`
|
||||
Learn more about how metadata is tracked in the ``unstructured`` library.
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import uuid
|
||||
import json
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
@ -26,7 +26,10 @@ def test_text_id():
|
||||
|
||||
def test_text_uuid():
|
||||
text_element = Text(text="hello there!", element_id=UUID())
|
||||
assert isinstance(text_element.id, uuid.UUID)
|
||||
assert len(text_element.id) == 36
|
||||
assert text_element.id.count("-") == 4
|
||||
# Test that the element is JSON serializable. This should run without an error
|
||||
json.dumps(text_element.to_dict())
|
||||
|
||||
|
||||
def test_element_defaults_to_blank_id():
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
@ -464,3 +465,16 @@ def test_partition_text_from_text_with_custom_metadata_date(
|
||||
elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date)
|
||||
|
||||
assert elements[0].metadata.last_modified == expected_last_modification_date
|
||||
|
||||
|
||||
def test_partition_text_with_unique_ids():
|
||||
elements = partition_text(text="hello there!")
|
||||
assert elements[0].id == "c69509590d81db2f37f9d75480c8efed"
|
||||
# Test that the element is JSON serializable. This should run without an error
|
||||
json.dumps(elements[0].to_dict())
|
||||
|
||||
elements = partition_text(text="hello there!", unique_element_ids=True)
|
||||
assert len(elements[0].id) == 36
|
||||
assert elements[0].id.count("-") == 4
|
||||
# Test that the element is JSON serializable. This should run without an error
|
||||
json.dumps(elements[0].to_dict())
|
||||
|
||||
@ -239,6 +239,11 @@ def process_metadata():
|
||||
regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {})
|
||||
elements = _add_regex_metadata(elements, regex_metadata)
|
||||
|
||||
unique_element_ids: bool = params.get("unique_element_ids", False)
|
||||
if unique_element_ids:
|
||||
for element in elements:
|
||||
element.id_to_uuid()
|
||||
|
||||
return elements
|
||||
|
||||
return wrapper
|
||||
@ -246,6 +251,10 @@ def process_metadata():
|
||||
return decorator
|
||||
|
||||
|
||||
def _elements_ids_to_uuid():
|
||||
pass
|
||||
|
||||
|
||||
def _add_regex_metadata(
|
||||
elements: List[Element],
|
||||
regex_metadata: Dict[str, str] = {},
|
||||
@ -300,6 +309,9 @@ class Element(ABC):
|
||||
)
|
||||
self.metadata = metadata.merge(ElementMetadata(coordinates=coordinates_metadata))
|
||||
|
||||
def id_to_uuid(self):
|
||||
self.id = str(uuid.uuid4())
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"type": None,
|
||||
@ -385,7 +397,7 @@ class Text(Element):
|
||||
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
|
||||
|
||||
elif isinstance(element_id, UUID):
|
||||
element_id = uuid.uuid4()
|
||||
element_id = str(uuid.uuid4())
|
||||
|
||||
super().__init__(
|
||||
element_id=element_id,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user