# unstructured/test_unstructured/documents/test_email_elements.py

from functools import partial

import pytest

from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.email_elements import EmailElement, Name, Subject


@pytest.mark.parametrize(
    "element",
    [
        EmailElement(text=""),
        Name(text="", name=""),
        Subject(text=""),
    ],
)
def test_EmailElement_autoassigns_a_UUID_then_becomes_an_idempotent_and_deterministic_hash(
    element: EmailElement,
):
    """The id starts out UUID-shaped and is swapped for a fixed hash by `.id_to_hash()`."""
    empty_text_hash = "5336294a19f32ff03ef80066fbc3e0f7"

    # -- before any hashing the id is a 36-character string containing four dashes --
    assert isinstance(element.id, str)
    assert (len(element.id), element.id.count("-")) == (36, 4)

    # -- the first `.id_to_hash()` call returns the hash and stores it as the id --
    first_result = element.id_to_hash(0)
    assert first_result == empty_text_hash
    assert element.id == empty_text_hash

    # -- a second call gives back the very same value --
    second_result = element.id_to_hash(0)
    assert second_result == empty_text_hash


def test_Name_should_assign_a_deterministic_and_an_idempotent_hash():
    """Hashing a `Name` that has no id yet yields the same known value on every call."""
    known_hash = "7d191bcecf80c122578c497de5f0dae7"
    name_element = Name(name="Example", text="hello there!")

    assert name_element._element_id is None, "Element should not have an ID yet"

    # -- the first pass computes the hash; the second pass must reproduce it unchanged --
    for _ in range(2):
        assert name_element.id_to_hash(0) == known_hash
        assert name_element.id == known_hash


@pytest.mark.parametrize(
    "element",
    [
        # -- `element_id` is omitted here, so it takes its default of None --
        EmailElement(text=""),
        # -- `element_id` is omitted here as well --
        Name(name="Example", text="hello there!"),
        # -- `element_id` is passed explicitly as None --
        Name(name="Example", text="hello there!", element_id=None),
    ],
)
def test_EmailElement_assigns_a_UUID_only_once_and_only_at_the_first_id_request(
    element: EmailElement,
):
    """The id is created on the first read of `.id` and later reads return the same value."""
    assert element._element_id is None, "Element should not have an ID yet"

    # -- the first read of `.id` is what populates the private id attribute --
    first_id = element.id
    assert element._element_id is not None, "Element should already have an ID"

    # -- the generated value has the shape of a UUID string --
    assert isinstance(first_id, str)
    assert (len(first_id), first_id.count("-")) == (36, 4)

    # -- reading `.id` again must not produce a different value --
    second_id = element.id
    assert second_id == first_id, "UUID assignment should happen only once"


def test_text_element_apply_cleaners():
    """A single cleaner passed to `.apply()` is reflected in both parts of the string form."""
    strip_bracket_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Name(text="[1] A Textbook on Crocodile Habitats", name="[2] Example docs")

    element.apply(strip_bracket_prefix)

    # -- both the "[2]" on the name and the "[1]" on the text are gone --
    rendered = str(element)
    assert rendered == "Example docs: A Textbook on Crocodile Habitats"


def test_name_element_apply_multiple_cleaners():
    """Several cleaners given to one `.apply()` call all show up in the string form."""
    source_title = "[1] A Textbook on Crocodile Habitats"
    expected = "Учебник по крокодильным средам обитания: Учебник по крокодильным средам обитания"
    element = Name(name=source_title, text=source_title)

    # -- cleaners are passed in this order: strip the bracketed prefix, then translate --
    element.apply(
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    )

    assert str(element) == expected


def test_apply_raises_if_func_does_not_produce_string():
    """`.apply()` raises `ValueError` when the supplied cleaner returns an int, not a string."""

    def returns_an_int(s):
        # -- deliberately ignores its input and hands back a non-string --
        return 1

    element = Name(text="[1] A Textbook on Crocodile Habitats", name="Example docs")

    with pytest.raises(ValueError):
        element.apply(returns_an_int)