2023-08-09 15:32:20 -07:00
|
|
|
import uuid
|
2022-12-21 16:03:44 -06:00
|
|
|
from functools import partial
|
2023-02-27 17:30:54 +01:00
|
|
|
|
2022-12-21 16:03:44 -06:00
|
|
|
import pytest
|
|
|
|
|
|
|
|
from unstructured.cleaners.core import clean_prefix
|
|
|
|
from unstructured.cleaners.translate import translate_text
|
2023-08-09 15:32:20 -07:00
|
|
|
from unstructured.documents.email_elements import UUID, EmailElement, Name, NoID
|
2022-12-21 16:03:44 -06:00
|
|
|
|
|
|
|
|
|
|
|
def test_text_id():
    """A Name built without an explicit element_id exposes the pinned id string."""
    expected_id = "c69509590d81db2f37f9d75480c8efed"
    element = Name(name="Example", text="hello there!")
    # NOTE(review): the id is presumably derived from the element's content;
    # confirm against the Name implementation before changing the inputs.
    assert element.id == expected_id
|
|
|
|
|
|
|
|
|
2023-08-09 15:32:20 -07:00
|
|
|
def test_text_uuid():
    """Passing ``element_id=UUID()`` gives the element an id that is a ``uuid.UUID``."""
    element = Name(name="Example", text="hello there!", element_id=UUID())
    element_id = element.id
    assert isinstance(element_id, uuid.UUID)
|
|
|
|
|
|
|
|
|
2022-12-21 16:03:44 -06:00
|
|
|
def test_element_defaults_to_blank_id():
    """An EmailElement constructed with no arguments carries a NoID instance as its id."""
    default_id = EmailElement().id
    assert isinstance(default_id, NoID)
|
|
|
|
|
|
|
|
|
|
|
|
def test_text_element_apply_cleaners():
    """Applying a single clean_prefix cleaner changes the element's string form.

    The element starts with bracketed-number prefixes on both ``name`` and
    ``text``; after ``apply`` the string form must equal the prefix-free
    "name: text" rendering asserted below.
    """
    element = Name(name="[2] Example docs", text="[1] A Textbook on Crocodile Habitats")
    strip_bracketed_number = partial(clean_prefix, pattern=r"\[\d{1,2}\]")

    element.apply(strip_bracketed_number)

    assert str(element) == "Example docs: A Textbook on Crocodile Habitats"
|
|
|
|
|
|
|
|
|
|
|
|
def test_name_element_apply_multiple_cleaners():
    """Applying several cleaners in one ``apply`` call runs them all on the element.

    A prefix-stripping cleaner is passed first and a Russian translation
    cleaner second; the resulting string form must match the translated,
    prefix-free rendering asserted below.
    """
    raw_title = "[1] A Textbook on Crocodile Habitats"
    element = Name(name=raw_title, text=raw_title)

    # Cleaners are passed positionally, in the same order as the original list.
    element.apply(
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    )

    rendered = str(element)
    assert (
        rendered
        == "Учебник по крокодильным средам обитания: Учебник по крокодильным средам обитания"
    )
|
|
|
|
|
|
|
|
|
|
|
|
def test_apply_raises_if_func_does_not_produce_string():
    """``apply`` must raise ValueError when a cleaner returns a non-string value."""

    def returns_int(s):
        # Deliberately ignores its input and returns an int, not a str.
        return 1

    element = Name(name="Example docs", text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(returns_int)
|