feat: added UUID option for element_id arg in element constructor (#1076)

* added UUID option for element_id arg in element constructor and updated unit tests

* updated CHANGELOG and bumped to dev2
This commit is contained in:
Chris Pappalardo 2023-08-09 15:32:20 -07:00 committed by GitHub
parent 112347aa0d
commit ef5091f276
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 51 additions and 12 deletions

View File

@ -1,10 +1,11 @@
## 0.9.2-dev1
## 0.9.2-dev2
### Enhancements
* Update table extraction section in API documentation to sync with change in Prod API
* Update Notion connector to extract to html
* Added UUID option for the `element_id` argument of element constructors
### Features

View File

@ -1,3 +1,4 @@
import uuid
from functools import partial
import pytest
@ -9,7 +10,13 @@ from unstructured.documents.coordinates import (
Orientation,
RelativeCoordinateSystem,
)
from unstructured.documents.elements import CoordinatesMetadata, Element, NoID, Text
from unstructured.documents.elements import (
UUID,
CoordinatesMetadata,
Element,
NoID,
Text,
)
def test_text_id():
@ -17,11 +24,21 @@ def test_text_id():
assert text_element.id == "c69509590d81db2f37f9d75480c8efed"
def test_text_uuid():
    """Passing the ``UUID`` marker to ``Text`` yields a real ``uuid.UUID`` id."""
    generated_id = Text(text="hello there!", element_id=UUID()).id
    assert isinstance(generated_id, uuid.UUID)
def test_element_defaults_to_blank_id():
    """An ``Element`` built without an ``element_id`` carries a ``NoID`` id."""
    default_id = Element().id
    assert isinstance(default_id, NoID)
def test_element_uuid():
    """A base ``Element`` given the ``UUID`` marker keeps that marker as its id."""
    # The id is checked against the project's UUID marker class here,
    # not against the standard library's uuid.UUID.
    stored_id = Element(element_id=UUID()).id
    assert isinstance(stored_id, UUID)
def test_text_element_apply_cleaners():
text_element = Text(text="[1] A Textbook on Crocodile Habitats")

View File

@ -1,10 +1,11 @@
import uuid
from functools import partial
import pytest
from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.email_elements import EmailElement, Name, NoID
from unstructured.documents.email_elements import UUID, EmailElement, Name, NoID
def test_text_id():
@ -12,6 +13,11 @@ def test_text_id():
assert name_element.id == "c69509590d81db2f37f9d75480c8efed"
def test_text_uuid():
    """Passing the ``UUID`` marker to ``Name`` yields a real ``uuid.UUID`` id."""
    generated_id = Name(name="Example", text="hello there!", element_id=UUID()).id
    assert isinstance(generated_id, uuid.UUID)
def test_element_defaults_to_blank_id():
    """An ``EmailElement`` built without an ``element_id`` carries a ``NoID`` id."""
    default_id = EmailElement().id
    assert isinstance(default_id, NoID)

View File

@ -1 +1 @@
__version__ = "0.9.2-dev1" # pragma: no cover
__version__ = "0.9.2-dev2" # pragma: no cover

View File

@ -6,6 +6,7 @@ import inspect
import os
import pathlib
import re
import uuid
from abc import ABC
from copy import deepcopy
from dataclasses import dataclass
@ -25,6 +26,12 @@ class NoID(ABC):
pass
class UUID(ABC):
    """Marker type signalling that an element should be given a UUID as its id.

    Pass an instance as ``element_id``. ``Text`` replaces the marker with a
    freshly generated ``uuid.uuid4()`` value; the base ``Element`` constructor
    stores whatever it is handed without converting it.
    """
@dataclass
class DataSourceMetadata:
"""Metadata fields that pertain to the data source of the document."""
@ -273,14 +280,14 @@ class Element(ABC):
def __init__(
self,
element_id: Union[str, NoID] = NoID(),
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
coordinate_system: Optional[CoordinateSystem] = None,
metadata: Optional[ElementMetadata] = None,
):
if metadata is None:
metadata = ElementMetadata()
self.id: Union[str, NoID] = element_id
self.id: Union[str, uuid.UUID, NoID, UUID] = element_id
coordinates_metadata = (
None
if coordinates is None and coordinate_system is None
@ -329,7 +336,7 @@ class CheckBox(Element):
def __init__(
self,
element_id: Union[str, NoID] = NoID(),
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
coordinate_system: Optional[CoordinateSystem] = None,
checked: bool = False,
@ -365,7 +372,7 @@ class Text(Element):
def __init__(
self,
text: str,
element_id: Union[str, NoID] = NoID(),
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
coordinate_system: Optional[CoordinateSystem] = None,
metadata: Optional[ElementMetadata] = None,
@ -377,6 +384,9 @@ class Text(Element):
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
elif isinstance(element_id, UUID):
element_id = uuid.uuid4()
super().__init__(
element_id=element_id,
metadata=metadata,

View File

@ -1,9 +1,10 @@
import hashlib
import uuid
from abc import ABC
from datetime import datetime
from typing import Callable, List, Union
from unstructured.documents.elements import Element, NoID, Text
from unstructured.documents.elements import UUID, Element, NoID, Text
class NoDatestamp(ABC):
@ -28,7 +29,7 @@ class Name(EmailElement):
name: str,
text: str,
datestamp: Union[datetime, NoDatestamp] = NoDatestamp(),
element_id: Union[str, NoID] = NoID(),
element_id: Union[str, uuid.UUID, NoID, UUID] = NoID(),
):
self.name: str = name
self.text: str = text
@ -37,6 +38,9 @@ class Name(EmailElement):
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]
elif isinstance(element_id, UUID):
element_id = uuid.uuid4()
super().__init__(element_id=element_id)
if isinstance(datestamp, datetime):

View File

@ -1,7 +1,8 @@
import os
import uuid
from typing import Any, Dict, List, Optional, Sequence, Union
from unstructured.documents.elements import NoID, Text
from unstructured.documents.elements import UUID, NoID, Text
VALID_ATTACHMENT_TYPES: List[str] = ["IMAGE", "VIDEO", "RAW_TEXT", "TEXT_URL", "HTML"]
@ -49,7 +50,7 @@ def stage_for_label_box(
Stages documents to be uploaded to LabelBox and generates LabelBox configuration.
ref: https://docs.labelbox.com/reference/data-import-format-overview
"""
ids: Sequence[Union[str, NoID]]
ids: Sequence[Union[str, uuid.UUID, NoID, UUID]]
if (external_ids is not None) and len(external_ids) != len(elements):
raise ValueError(
"The external_ids parameter must be a list and the length of external_ids parameter "