mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
feat: added UUID option for element_id arg in element constructor (#1076)
* added UUID option for element_id arg in element constructor and updated unit tests
* updated CHANGELOG and bumped to dev2
This commit is contained in:
parent
112347aa0d
commit
ef5091f276
@ -1,10 +1,11 @@
|
||||
## 0.9.2-dev1
|
||||
## 0.9.2-dev2
|
||||
=======
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Update table extraction section in API documentation to sync with change in Prod API
|
||||
* Update Notion connector to extract to html
|
||||
* Added UUID option for element_id
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import uuid
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
@ -9,7 +10,13 @@ from unstructured.documents.coordinates import (
|
||||
Orientation,
|
||||
RelativeCoordinateSystem,
|
||||
)
|
||||
from unstructured.documents.elements import CoordinatesMetadata, Element, NoID, Text
|
||||
from unstructured.documents.elements import (
|
||||
UUID,
|
||||
CoordinatesMetadata,
|
||||
Element,
|
||||
NoID,
|
||||
Text,
|
||||
)
|
||||
|
||||
|
||||
def test_text_id():
|
||||
@ -17,11 +24,21 @@ def test_text_id():
|
||||
assert text_element.id == "c69509590d81db2f37f9d75480c8efed"
|
||||
|
||||
|
||||
def test_text_uuid():
|
||||
text_element = Text(text="hello there!", element_id=UUID())
|
||||
assert isinstance(text_element.id, uuid.UUID)
|
||||
|
||||
|
||||
def test_element_defaults_to_blank_id():
|
||||
element = Element()
|
||||
assert isinstance(element.id, NoID)
|
||||
|
||||
|
||||
def test_element_uuid():
|
||||
element = Element(element_id=UUID())
|
||||
assert isinstance(element.id, UUID)
|
||||
|
||||
|
||||
def test_text_element_apply_cleaners():
|
||||
text_element = Text(text="[1] A Textbook on Crocodile Habitats")
|
||||
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
import uuid
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.cleaners.core import clean_prefix
|
||||
from unstructured.cleaners.translate import translate_text
|
||||
from unstructured.documents.email_elements import EmailElement, Name, NoID
|
||||
from unstructured.documents.email_elements import UUID, EmailElement, Name, NoID
|
||||
|
||||
|
||||
def test_text_id():
|
||||
@ -12,6 +13,11 @@ def test_text_id():
|
||||
assert name_element.id == "c69509590d81db2f37f9d75480c8efed"
|
||||
|
||||
|
||||
def test_text_uuid():
|
||||
name_element = Name(name="Example", text="hello there!", element_id=UUID())
|
||||
assert isinstance(name_element.id, uuid.UUID)
|
||||
|
||||
|
||||
def test_element_defaults_to_blank_id():
|
||||
element = EmailElement()
|
||||
assert isinstance(element.id, NoID)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.9.2-dev1" # pragma: no cover
|
||||
__version__ = "0.9.2-dev2" # pragma: no cover
|
||||
|
||||
@ -6,6 +6,7 @@ import inspect
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
import uuid
|
||||
from abc import ABC
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
@ -25,6 +26,12 @@ class NoID(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class UUID(ABC):
|
||||
"""Class to indicate that an element should have a UUID."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataSourceMetadata:
|
||||
"""Metadata fields that pertain to the data source of the document."""
|
||||
@ -273,14 +280,14 @@ class Element(ABC):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
element_id: Union[str, NoID] = NoID(),
|
||||
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
|
||||
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
metadata: Optional[ElementMetadata] = None,
|
||||
):
|
||||
if metadata is None:
|
||||
metadata = ElementMetadata()
|
||||
self.id: Union[str, NoID] = element_id
|
||||
self.id: Union[str, uuid.UUID, NoID, UUID] = element_id
|
||||
coordinates_metadata = (
|
||||
None
|
||||
if coordinates is None and coordinate_system is None
|
||||
@ -329,7 +336,7 @@ class CheckBox(Element):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
element_id: Union[str, NoID] = NoID(),
|
||||
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
|
||||
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
checked: bool = False,
|
||||
@ -365,7 +372,7 @@ class Text(Element):
|
||||
def __init__(
|
||||
self,
|
||||
text: str,
|
||||
element_id: Union[str, NoID] = NoID(),
|
||||
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
|
||||
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
metadata: Optional[ElementMetadata] = None,
|
||||
@ -377,6 +384,9 @@ class Text(Element):
|
||||
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
|
||||
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
|
||||
|
||||
elif isinstance(element_id, UUID):
|
||||
element_id = uuid.uuid4()
|
||||
|
||||
super().__init__(
|
||||
element_id=element_id,
|
||||
metadata=metadata,
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
import hashlib
|
||||
import uuid
|
||||
from abc import ABC
|
||||
from datetime import datetime
|
||||
from typing import Callable, List, Union
|
||||
|
||||
from unstructured.documents.elements import Element, NoID, Text
|
||||
from unstructured.documents.elements import UUID, Element, NoID, Text
|
||||
|
||||
|
||||
class NoDatestamp(ABC):
|
||||
@ -28,7 +29,7 @@ class Name(EmailElement):
|
||||
name: str,
|
||||
text: str,
|
||||
datestamp: Union[datetime, NoDatestamp] = NoDatestamp(),
|
||||
element_id: Union[str, NoID] = NoID(),
|
||||
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
|
||||
):
|
||||
self.name: str = name
|
||||
self.text: str = text
|
||||
@ -37,6 +38,9 @@ class Name(EmailElement):
|
||||
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
|
||||
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
|
||||
|
||||
elif isinstance(element_id, UUID):
|
||||
element_id = uuid.uuid4()
|
||||
|
||||
super().__init__(element_id=element_id)
|
||||
|
||||
if isinstance(datestamp, datetime):
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
import os
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Sequence, Union
|
||||
|
||||
from unstructured.documents.elements import NoID, Text
|
||||
from unstructured.documents.elements import UUID, NoID, Text
|
||||
|
||||
VALID_ATTACHMENT_TYPES: List[str] = ["IMAGE", "VIDEO", "RAW_TEXT", "TEXT_URL", "HTML"]
|
||||
|
||||
@ -49,7 +50,7 @@ def stage_for_label_box(
|
||||
Stages documents to be uploaded to LabelBox and generates LabelBox configuration.
|
||||
ref: https://docs.labelbox.com/reference/data-import-format-overview
|
||||
"""
|
||||
ids: Sequence[Union[str, NoID]]
|
||||
ids: Sequence[Union[str, uuid.UUID, NoID, UUID]]
|
||||
if (external_ids is not None) and len(external_ids) != len(elements):
|
||||
raise ValueError(
|
||||
"The external_ids parameter must be a list and the length of external_ids parameter "
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user