mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-09 23:17:21 +00:00
feat: unique_element_ids kwarg for UUID elements (#1085)
* added kwarg for unique elements * test for unique ids * update docs * changelog and version
This commit is contained in:
parent
d26ab1deac
commit
fa5a3dbd81
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* Add `unique_element_ids` kwarg to partition functions. If `True`, will use a UUID
|
||||||
|
for element IDs instead of a SHA-256 hash.
|
||||||
* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
|
* Add functionality to switch `html` text parser based on whether the `html` text contains emoji
|
||||||
* Add functionality to check if a string contains any emoji characters
|
* Add functionality to check if a string contains any emoji characters
|
||||||
|
|
||||||
@ -15,6 +17,7 @@
|
|||||||
|
|
||||||
* Update table extraction section in API documentation to sync with change in Prod API
|
* Update table extraction section in API documentation to sync with change in Prod API
|
||||||
* Update Notion connector to extract to html
|
* Update Notion connector to extract to html
|
||||||
|
* Added UUID option for `element_id`
|
||||||
* Bump unstructured-inference==0.5.9:
|
* Bump unstructured-inference==0.5.9:
|
||||||
- better caching of models
|
- better caching of models
|
||||||
- another version of detectron2 available, though the default layout model is unchanged
|
- another version of detectron2 available, though the default layout model is unchanged
|
||||||
|
|||||||
@ -210,6 +210,24 @@ a list of elements from JSON, as seen in the snippet below
|
|||||||
elements = elements_from_json(filename=filename)
|
elements = elements_from_json(filename=filename)
|
||||||
|
|
||||||
|
|
||||||
|
###################
|
||||||
|
Unique Element IDs
|
||||||
|
###################
|
||||||
|
|
||||||
|
By default, the element ID is a SHA-256 hash of the element text. This is to ensure that
|
||||||
|
the ID is deterministic. One downside is that the ID is not guaranteed to be unique.
|
||||||
|
Different elements with the same text will have the same ID, and there could also
|
||||||
|
be hash collisions. To use UUIDs in the output instead, you can pass
|
||||||
|
``unique_element_ids=True`` into any of the partition functions. This can be helpful
|
||||||
|
if you'd like to use the IDs as a primary key in a database, for example.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
|
elements = partition_text(text="Here is some example text.", unique_element_ids=True)
|
||||||
|
elements[0].id
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
Wrapping it all up
|
Wrapping it all up
|
||||||
|
|||||||
@ -21,7 +21,7 @@ Library Documentation
|
|||||||
Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
|
Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
|
||||||
|
|
||||||
:doc:`connectors`
|
:doc:`connectors`
|
||||||
Connect to your favortite data storage platforms for an efortless batch processing of your files.
|
Connect to your favorite data storage platforms for an effortless batch processing of your files.
|
||||||
|
|
||||||
:doc:`metadata`
|
:doc:`metadata`
|
||||||
Learn more about how metadata is tracked in the ``unstructured`` library.
|
Learn more about how metadata is tracked in the ``unstructured`` library.
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
import uuid
|
import json
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -26,7 +26,10 @@ def test_text_id():
|
|||||||
|
|
||||||
def test_text_uuid():
|
def test_text_uuid():
|
||||||
text_element = Text(text="hello there!", element_id=UUID())
|
text_element = Text(text="hello there!", element_id=UUID())
|
||||||
assert isinstance(text_element.id, uuid.UUID)
|
assert len(text_element.id) == 36
|
||||||
|
assert text_element.id.count("-") == 4
|
||||||
|
# Test that the element is JSON serializable. This should run without an error
|
||||||
|
json.dumps(text_element.to_dict())
|
||||||
|
|
||||||
|
|
||||||
def test_element_defaults_to_blank_id():
|
def test_element_defaults_to_blank_id():
|
||||||
|
|||||||
@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
@ -464,3 +465,16 @@ def test_partition_text_from_text_with_custom_metadata_date(
|
|||||||
elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date)
|
elements = partition_text(text=text, metadata_last_modified=expected_last_modification_date)
|
||||||
|
|
||||||
assert elements[0].metadata.last_modified == expected_last_modification_date
|
assert elements[0].metadata.last_modified == expected_last_modification_date
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_text_with_unique_ids():
|
||||||
|
elements = partition_text(text="hello there!")
|
||||||
|
assert elements[0].id == "c69509590d81db2f37f9d75480c8efed"
|
||||||
|
# Test that the element is JSON serializable. This should run without an error
|
||||||
|
json.dumps(elements[0].to_dict())
|
||||||
|
|
||||||
|
elements = partition_text(text="hello there!", unique_element_ids=True)
|
||||||
|
assert len(elements[0].id) == 36
|
||||||
|
assert elements[0].id.count("-") == 4
|
||||||
|
# Test that the element is JSON serializable. This should run without an error
|
||||||
|
json.dumps(elements[0].to_dict())
|
||||||
|
|||||||
@ -239,6 +239,11 @@ def process_metadata():
|
|||||||
regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {})
|
regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {})
|
||||||
elements = _add_regex_metadata(elements, regex_metadata)
|
elements = _add_regex_metadata(elements, regex_metadata)
|
||||||
|
|
||||||
|
unique_element_ids: bool = params.get("unique_element_ids", False)
|
||||||
|
if unique_element_ids:
|
||||||
|
for element in elements:
|
||||||
|
element.id_to_uuid()
|
||||||
|
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
@ -246,6 +251,10 @@ def process_metadata():
|
|||||||
return decorator
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def _elements_ids_to_uuid():
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _add_regex_metadata(
|
def _add_regex_metadata(
|
||||||
elements: List[Element],
|
elements: List[Element],
|
||||||
regex_metadata: Dict[str, str] = {},
|
regex_metadata: Dict[str, str] = {},
|
||||||
@ -300,6 +309,9 @@ class Element(ABC):
|
|||||||
)
|
)
|
||||||
self.metadata = metadata.merge(ElementMetadata(coordinates=coordinates_metadata))
|
self.metadata = metadata.merge(ElementMetadata(coordinates=coordinates_metadata))
|
||||||
|
|
||||||
|
def id_to_uuid(self):
|
||||||
|
self.id = str(uuid.uuid4())
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
return {
|
return {
|
||||||
"type": None,
|
"type": None,
|
||||||
@ -385,7 +397,7 @@ class Text(Element):
|
|||||||
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
|
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
|
||||||
|
|
||||||
elif isinstance(element_id, UUID):
|
elif isinstance(element_id, UUID):
|
||||||
element_id = uuid.uuid4()
|
element_id = str(uuid.uuid4())
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
element_id=element_id,
|
element_id=element_id,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user