# pyright: reportPrivateUsage=false
"""Test-suite for `unstructured.documents.elements` module."""
from __future__ import annotations
import copy
import json
import pathlib
from functools import partial
import pytest
from test_unstructured.unit_utils import assign_hash_ids
from unstructured.cleaners.core import clean_bullets, clean_prefix
from unstructured.documents.coordinates import (
CoordinateSystem,
Orientation,
RelativeCoordinateSystem,
)
from unstructured.documents.elements import (
CheckBox,
ConsolidationStrategy,
CoordinatesMetadata,
DataSourceMetadata,
Element,
ElementMetadata,
Points,
RegexMetadata,
Text,
Title,
assign_and_map_hash_ids,
)
@pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
def test_Element_autoassigns_a_UUID_then_becomes_an_idempotent_and_deterministic_hash(
    element: Element,
):
    """A new element has a UUID-shaped id until `.id_to_hash()` replaces it with a fixed hash."""
    # -- before hashing, the id is a str shaped like a UUID: 36 chars with four dashes --
    initial_id = element.id
    assert isinstance(initial_id, str)
    assert len(initial_id) == 36
    assert initial_id.count("-") == 4

    expected_hash = "5336294a19f32ff03ef80066fbc3e0f7"

    # -- the first pass switches the id over to the hash; the second pass shows that calling
    # -- `.id_to_hash()` again produces the same hash and leaves the id unchanged
    for _ in range(2):
        assert element.id_to_hash(0) == expected_hash
        assert element.id == expected_hash
def test_Text_is_JSON_serializable():
    """The dict produced by `Text.to_dict()` can be passed to `json.dumps()`."""
    text_element = Text(text="hello there!", element_id=None)
    # -- this should run without an error --
    json.dumps(text_element.to_dict())
@pytest.mark.parametrize(
    "element",
    [
        Element(),
        Text(text=""),  # -- element_id omitted --
        Text(text="", element_id=None),  # -- element_id passed explicitly as None --
        CheckBox(),
    ],
)
def test_Element_self_assigns_itself_a_UUID_id(element: Element):
    """An element built without an explicit id reports a UUID-shaped string id."""
    assigned_id = element.id
    assert isinstance(assigned_id, str)
    # -- a canonical UUID string is 36 characters long and contains four dashes --
    assert len(assigned_id) == 36
    assert assigned_id.count("-") == 4
def test_text_element_apply_cleaners():
    """`.apply()` with a single prefix cleaner removes the leading "[1]" marker from the text."""
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation)

    assert str(element) == "A Textbook on Crocodile Habitats"
def test_text_element_apply_multiple_cleaners():
    """`.apply()` accepts several cleaners; here the prefix and the bullet are both removed."""
    element = Text(text="[1] \u2022 A Textbook on Crocodile Habitats")

    element.apply(
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(clean_bullets),
    )

    assert str(element) == "A Textbook on Crocodile Habitats"
def test_non_text_elements_are_serializable_to_text():
    """A `CheckBox` still exposes a `text` attribute, and both it and `str()` are empty."""
    checkbox = CheckBox()

    assert hasattr(checkbox, "text")
    checkbox_text = checkbox.text
    assert checkbox_text is not None
    assert checkbox_text == ""
    assert str(checkbox) == ""
def test_apply_raises_if_func_does_not_produce_string():
    """`.apply()` raises `ValueError` when a cleaner returns something other than a str."""

    def returns_an_int(s: str):
        # -- deliberately violates the cleaner contract by returning an int --
        return 1

    element = Text(text="[1] A Textbook on Crocodile Habitats")

    with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
        element.apply(returns_an_int)  # pyright: ignore[reportArgumentType]
@pytest.mark.parametrize(
    ("coordinates", "orientation1", "orientation2", "expected_coords"),
    [
        # -- same orientation on both sides: points are only scaled --
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.CARTESIAN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
        # -- orientation differs: the expected y values are measured from the other edge --
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.SCREEN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.CARTESIAN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.SCREEN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
    ],
)
def test_convert_coordinates_to_new_system(
    coordinates: Points,
    orientation1: Orientation,
    orientation2: Orientation,
    expected_coords: Points,
):
    """Points convert from a 100x200 system to a 1000x2000 system for each orientation pair."""

    def assert_points_match(actual: Points, expected: Points) -> None:
        # -- compare point-by-point with a float tolerance --
        for actual_point, expected_point in zip(actual, expected):
            assert actual_point == pytest.approx(  # pyright: ignore[reportUnknownMemberType]
                expected_point
            )

    source_system = CoordinateSystem(100, 200)
    source_system.orientation = orientation1
    target_system = CoordinateSystem(1000, 2000)
    target_system.orientation = orientation2
    element = Element(coordinates=coordinates, coordinate_system=source_system)

    # -- called without `in_place`, the converted points are returned --
    converted = element.convert_coordinates_to_new_system(target_system)
    assert converted is not None
    assert_points_match(converted, expected_coords)

    # -- called with `in_place=True`, the element's coordinate metadata carries the new points
    # -- and the new system
    element.convert_coordinates_to_new_system(target_system, in_place=True)
    coordinates_metadata = element.metadata.coordinates
    assert coordinates_metadata is not None
    assert coordinates_metadata.points is not None
    assert_points_match(coordinates_metadata.points, expected_coords)
    assert coordinates_metadata.system == target_system
def test_convert_coordinate_to_new_system_none():
    """Conversion returns None for an element constructed without coordinates."""
    screen_system = CoordinateSystem(100, 200)
    screen_system.orientation = Orientation.SCREEN
    element = Element(coordinates=None, coordinate_system=None)

    converted = element.convert_coordinates_to_new_system(screen_system)

    assert converted is None
def test_element_constructor_coordinates_all_present():
    """Points and system passed to `Element()` end up together in `metadata.coordinates`."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    system = RelativeCoordinateSystem()

    element = Element(coordinates=points, coordinate_system=system)

    assert element.metadata.coordinates == CoordinatesMetadata(points=points, system=system)
def test_element_constructor_coordinates_points_absent():
    """`Element()` raises `ValueError` when given a coordinate system but no points."""
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )

    with pytest.raises(ValueError) as exc_info:
        Element(coordinate_system=RelativeCoordinateSystem())

    assert str(exc_info.value) == expected_message
def test_element_constructor_coordinates_system_absent():
    """`Element()` raises `ValueError` when given points but no coordinate system."""
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )

    with pytest.raises(ValueError) as exc_info:
        Element(coordinates=((1, 2), (1, 4), (3, 4), (3, 2)))

    assert str(exc_info.value) == expected_message
def test_coordinate_metadata_serdes():
    """`CoordinatesMetadata` serializes to the expected dict and deserializes back to an equal."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    original = CoordinatesMetadata(points=points, system=RelativeCoordinateSystem())

    serialized = original.to_dict()

    # -- the system is written out by class name, alongside its layout dimensions --
    assert serialized == {
        "layout_height": 1,
        "layout_width": 1,
        "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
        "system": "RelativeCoordinateSystem",
    }
    # -- and the dict round-trips to an equal object --
    assert CoordinatesMetadata.from_dict(serialized) == original
def test_element_to_dict():
    """`Element.to_dict()` emits metadata, type, text and element_id for a bare element."""
    element = Element(
        element_id="awt32t1",
        coordinates=((1, 2), (1, 4), (3, 4), (3, 2)),
        coordinate_system=RelativeCoordinateSystem(),
    )
    expected_coordinates = {
        "layout_height": 1,
        "layout_width": 1,
        "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
        "system": "RelativeCoordinateSystem",
    }

    serialized = element.to_dict()

    assert serialized == {
        "metadata": {"coordinates": expected_coordinates},
        "type": None,
        "text": "",
        "element_id": "awt32t1",
    }
def test_regex_metadata_round_trips_through_JSON():
    """metadata.regex_metadata should appear at full depth in JSON."""
    metadata = ElementMetadata(
        regex_metadata={
            "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
            "version": [
                RegexMetadata(text="current=v1.7.2", start=7, end=21),
                RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
            ],
        }
    )

    # -- serialize, rebuild from the parsed JSON, then serialize the rebuilt object again --
    first_json = json.dumps(metadata.to_dict())
    rebuilt = ElementMetadata.from_dict(json.loads(first_json))
    second_json = json.dumps(rebuilt.to_dict())

    # -- nothing may be lost or altered between the two JSON documents --
    assert second_json == first_json
class DescribeElementMetadata:
    """Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""

    # -- It can be constructed with known keyword arguments. In particular, including a non-known
    # -- keyword argument produces a type-error at development time and raises an exception at
    # -- runtime. This catches typos before they reach production.

    def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
        with pytest.raises(TypeError, match="got an unexpected keyword argument 'file_name'"):
            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportCallIssue]

    @pytest.mark.parametrize(
        "file_path",
        [
            pathlib.Path("documents/docx") / "memos" / "memo-2023-11-10.docx",
            "documents/docx/memos/memo-2023-11-10.docx",
        ],
    )
    def it_accommodates_either_a_pathlib_Path_or_str_for_its_filename_arg(
        self, file_path: pathlib.Path | str
    ):
        meta = ElementMetadata(filename=file_path)

        assert meta.file_directory == "documents/docx/memos"
        assert meta.filename == "memo-2023-11-10.docx"

    def it_leaves_both_filename_and_file_directory_None_when_neither_is_specified(self):
        meta = ElementMetadata()

        assert meta.file_directory is None
        assert meta.filename is None

    @pytest.mark.parametrize("file_path", [pathlib.Path("memo.docx"), "memo.docx"])
    def and_it_leaves_file_directory_None_when_not_specified_and_filename_is_not_a_path(
        self, file_path: pathlib.Path | str
    ):
        meta = ElementMetadata(filename=file_path)

        assert meta.file_directory is None
        assert meta.filename == "memo.docx"

    def and_it_splits_off_directory_path_from_its_filename_arg_when_it_is_a_file_path(self):
        meta = ElementMetadata(filename="documents/docx/memo-2023-11-11.docx")

        assert meta.file_directory == "documents/docx"
        assert meta.filename == "memo-2023-11-11.docx"

    def but_it_prefers_a_specified_file_directory_when_filename_also_contains_a_path(self):
        meta = ElementMetadata(filename="tmp/staging/memo.docx", file_directory="documents/docx")

        assert meta.file_directory == "documents/docx"
        assert meta.filename == "memo.docx"

    # -- It knows the types of its known members so type-checking support is available. --

    def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
        ElementMetadata(
            category_depth="2",  # pyright: ignore[reportArgumentType]
            file_directory=True,  # pyright: ignore[reportArgumentType]
            text_as_html=42,  # pyright: ignore[reportArgumentType]
        )
        # -- it does not check types at runtime however (choosing to avoid validation overhead) --

    # -- It only stores a field's value when it is not None. --

    def it_returns_the_value_of_an_attribute_it_has(self):
        meta = ElementMetadata(url="https://google.com")

        assert "url" in meta.__dict__
        assert meta.url == "https://google.com"

    def and_it_returns_None_for_a_known_attribute_it_does_not_have(self):
        meta = ElementMetadata()

        assert "url" not in meta.__dict__
        assert meta.url is None

    def but_it_raises_AttributeError_for_an_unknown_attribute_it_does_not_have(self):
        meta = ElementMetadata()

        assert "coefficient" not in meta.__dict__
        with pytest.raises(AttributeError, match="object has no attribute 'coefficient'"):
            meta.coefficient

    def it_stores_a_non_None_field_value_when_assigned(self):
        meta = ElementMetadata()
        assert "file_directory" not in meta.__dict__

        meta.file_directory = "tmp/"

        assert "file_directory" in meta.__dict__
        assert meta.file_directory == "tmp/"

    def it_removes_a_field_when_None_is_assigned_to_it(self):
        meta = ElementMetadata(file_directory="tmp/")
        assert "file_directory" in meta.__dict__
        assert meta.file_directory == "tmp/"

        meta.file_directory = None

        assert "file_directory" not in meta.__dict__
        assert meta.file_directory is None

    # -- It can serialize itself to a dict -------------------------------------------------------

    def it_can_serialize_itself_to_a_dict(self):
        # -- `text_as_html` is a single-line, non-empty literal; a quoted string may not span
        # -- physical lines, and a non-empty value shows the field is carried through verbatim
        meta = ElementMetadata(
            category_depth=1,
            file_directory="tmp/",
            page_number=2,
            text_as_html="<table></table>",
            url="https://google.com",
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "file_directory": "tmp/",
            "page_number": 2,
            "text_as_html": "<table></table>",
            "url": "https://google.com",
        }

    def and_it_serializes_a_coordinates_sub_object_to_a_dict_when_it_is_present(self):
        meta = ElementMetadata(
            category_depth=1,
            coordinates=CoordinatesMetadata(
                points=((2, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            ),
            page_number=2,
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "coordinates": {
                "layout_height": 1,
                "layout_width": 1,
                "points": ((2, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
            "page_number": 2,
        }

    def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self):
        meta = ElementMetadata(
            category_depth=1,
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-09",
            ),
            page_number=2,
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "data_source": {
                "url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
                "date_created": "2023-11-09",
            },
            "page_number": 2,
        }

    def and_it_serializes_an_orig_elements_sub_object_to_base64_when_it_is_present(self):
        elements = assign_hash_ids([Title("Lorem"), Text("Lorem Ipsum")])
        meta = ElementMetadata(
            category_depth=1,
            orig_elements=elements,
            page_number=2,
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "orig_elements": (
                "eJyFzcsKwjAQheFXKVm7MGkzbXwDocu6EpFcTqTQG3UEtfTdbZa"
                "6cTnDd/jPi0CHHgNf2yAOmXCljjqXoErKoIw3hqJRXlPuyphrEr"
                "tM9GAbLNvNL+t2M56ctvU4o0+AXxPSo2m5g9jIb6VwBE0VBSujp"
                "1LJ6EiRLpwiSBf3fyvZcbo/vlqnwVvGbZzbN0KT7Hr5AG/eQyM="
            ),
            "page_number": 2,
        }

    def but_unlike_in_ElementMetadata_unknown_fields_in_sub_objects_are_ignored(self):
        """Metadata sub-objects ignore fields they do not explicitly define.

        This is _not_ the case for ElementMetadata itself where a non-known field is welcomed as a
        user-defined ad-hoc metadata field.
        """
        element_metadata = {
            "new_field": "hello",
            "data_source": {
                "new_field": "world",
            },
            "coordinates": {
                "new_field": "foo",
            },
        }

        metadata = ElementMetadata.from_dict(element_metadata)
        metadata_dict = metadata.to_dict()

        assert "new_field" in metadata_dict
        assert "new_field" not in metadata_dict["coordinates"]
        assert "new_field" not in metadata_dict["data_source"]

    # -- It can deserialize itself from a dict ---------------------------------------------------

    def it_can_deserialize_itself_from_a_dict(self):
        meta_dict = {
            "category_depth": 1,
            "coefficient": 0.58,
            "coordinates": {
                "layout_height": 4,
                "layout_width": 2,
                "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
            "data_source": {
                "url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
                "date_created": "2023-11-09",
            },
            "languages": ["eng"],
        }

        meta = ElementMetadata.from_dict(meta_dict)

        # -- known fields present in dict are present in meta --
        assert meta.category_depth == 1

        # -- known sub-object fields present in dict are present in meta --
        assert meta.coordinates == CoordinatesMetadata(
            points=((1, 2), (1, 4), (3, 4), (3, 2)),
            system=RelativeCoordinateSystem(),
        )
        assert meta.data_source == DataSourceMetadata(
            url="https://www.nih.gov/about-nih/who-we-are/nih-director",
            date_created="2023-11-09",
        )

        # -- known fields absent from dict report None but are not present in meta --
        assert meta.file_directory is None
        assert "file_directory" not in meta.__dict__

        # -- non-known fields present in dict are present in meta (we have no way to tell whether
        # -- they are "ad-hoc" or not because we lack indication of user-intent)
        assert meta.coefficient == 0.58

        # -- ad-hoc fields absent from dict raise on attempted access --
        with pytest.raises(AttributeError, match="ntMetadata' object has no attribute 'quotient'"):
            meta.quotient

        # -- but that can be worked around by end-user --
        assert (meta.quotient if hasattr(meta, "quotient") else None) is None

        # -- mutating a mutable (collection) field does not affect the original value --
        assert isinstance(meta.languages, list)
        assert meta.languages == ["eng"]
        meta.languages.append("spa")
        assert meta.languages == ["eng", "spa"]
        assert meta_dict["languages"] == ["eng"]

    # -- It allows downstream users to add an arbitrary new member by assignment. ----------------

    def it_allows_an_end_user_to_add_an_arbitrary_field(self):
        meta = ElementMetadata()

        meta.foobar = 7

        assert "foobar" in meta.__dict__
        assert meta.foobar == 7

    def and_fields_so_added_appear_in_the_metadata_JSON(self):
        meta = ElementMetadata()

        meta.foobar = 7

        assert meta.to_dict() == {"foobar": 7}

    def and_it_removes_an_end_user_field_when_it_is_assigned_None(self):
        meta = ElementMetadata()
        meta.foobar = 7
        assert "foobar" in meta.__dict__

        meta.foobar = None

        assert "foobar" not in meta.__dict__
        with pytest.raises(
            AttributeError, match="'ElementMetadata' object has no attribute 'foobar'"
        ):
            meta.foobar

    # -- It can update itself from another instance ----------------------------------------------

    def it_can_update_itself_from_another_instance(self):
        meta = ElementMetadata(category_depth=1, page_number=1)
        meta.coefficient = 0.58
        meta.stem_length = 18
        other = ElementMetadata(file_directory="tmp/", page_number=2)
        other.quotient = 1.4
        other.stem_length = 20

        meta.update(other)

        # -- known-fields present on self but not other are unchanged --
        assert meta.category_depth == 1
        # -- known-fields present on other but not self are added --
        assert meta.file_directory == "tmp/"
        # -- known-fields present on both self and other are updated --
        assert meta.page_number == 2
        # -- ad-hoc-fields present on self but not other are unchanged --
        assert meta.coefficient == 0.58
        # -- ad-hoc-fields present on other but not self are added --
        assert meta.quotient == 1.4
        # -- ad-hoc-fields present on both self and other are updated --
        assert meta.stem_length == 20
        # -- other is left unchanged --
        assert other.category_depth is None
        assert other.file_directory == "tmp/"
        assert other.page_number == 2
        assert other.text_as_html is None
        assert other.url is None
        assert other.quotient == 1.4
        assert other.stem_length == 20
        with pytest.raises(AttributeError, match="etadata' object has no attribute 'coefficient'"):
            other.coefficient

    def but_it_raises_on_attempt_to_update_from_a_non_ElementMetadata_object(self):
        meta = ElementMetadata()

        with pytest.raises(ValueError, match=r"ate\(\)' must be an instance of 'ElementMetadata'"):
            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportArgumentType]

    # -- It knows when it is equal to another instance -------------------------------------------

    def it_is_equal_to_another_instance_with_the_same_known_field_values(self):
        meta = ElementMetadata(
            category_depth=1,
            coordinates=CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            ),
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-08",
            ),
            file_directory="tmp/",
            languages=["eng"],
            page_number=2,
            text_as_html="",
            url="https://google.com",
        )

        assert meta == ElementMetadata(
            category_depth=1,
            coordinates=CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            ),
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-08",
            ),
            file_directory="tmp/",
            languages=["eng"],
            page_number=2,
            text_as_html="",
            url="https://google.com",
        )

    def but_it_is_never_equal_to_a_non_ElementMetadata_object(self):
        class NotElementMetadata:
            pass

        meta = ElementMetadata()
        other = NotElementMetadata()

        # -- all the "fields" are the same --
        assert meta.__dict__ == other.__dict__
        # -- but it is rejected solely because its type is different --
        assert meta != other

    def it_is_equal_to_another_instance_with_the_same_ad_hoc_field_values(self):
        meta = ElementMetadata(category_depth=1)
        meta.coefficient = 0.58
        other = ElementMetadata(category_depth=1)
        other.coefficient = 0.58

        assert meta == other

    def but_it_is_not_equal_to_an_instance_with_ad_hoc_fields_that_differ(self):
        meta = ElementMetadata(category_depth=1)
        meta.coefficient = 0.58
        other = ElementMetadata(category_depth=1)
        other.coefficient = 0.72

        assert meta != other

    def it_is_not_equal_when_a_list_field_contains_different_items(self):
        meta = ElementMetadata(languages=["eng"])
        assert meta != ElementMetadata(languages=["eng", "spa"])

    def and_it_is_not_equal_when_the_coordinates_sub_object_field_differs(self):
        meta = ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            )
        )
        assert meta != ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=((2, 2), (2, 4), (3, 4), (4, 2)),
                system=RelativeCoordinateSystem(),
            )
        )

    def and_it_is_not_equal_when_the_data_source_sub_object_field_differs(self):
        meta = ElementMetadata(
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-08",
            )
        )
        assert meta != ElementMetadata(
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-09",
            )
        )

    # -- There is a consolidation-strategy for all known fields ----------------------------------

    def it_can_find_the_consolidation_strategy_for_each_of_its_known_fields(self):
        metadata = ElementMetadata()
        # -- sorted so a failure always reports the same first missing field --
        metadata_field_names = sorted(metadata._known_field_names)
        consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        for field_name in metadata_field_names:
            assert field_name in consolidation_strategies, (
                f"ElementMetadata field `.{field_name}` does not have a consolidation strategy."
                f" Add one in `ConsolidationStrategy.field_consolidation_strategies()."
            )
def test_hash_ids_are_unique_for_duplicate_elements():
    """Two elements with identical text and metadata still get distinct ids after re-hashing."""
    # GIVEN a parent followed by two identical children that both point at it
    parent = Text(text="Parent", metadata=ElementMetadata(page_number=1))
    elements = [parent]
    for _ in range(2):
        child_metadata = ElementMetadata(page_number=1, parent_id=parent.id)
        elements.append(Text(text="Element", metadata=child_metadata))

    # WHEN ids are recalculated on a deep copy, leaving the originals available for comparison
    rehashed_elements = assign_and_map_hash_ids(copy.deepcopy(elements))
    new_ids = [rehashed.id for rehashed in rehashed_elements]

    # THEN
    assert len(new_ids) == len(set(new_ids)), "Recalculated IDs must be unique."
    assert elements[1].metadata.parent_id == elements[2].metadata.parent_id

    for idx, rehashed in enumerate(rehashed_elements):
        original = elements[idx]
        assert rehashed.id != original.id, "IDs haven't changed after recalculation"

        new_parent_id = rehashed.metadata.parent_id
        if new_parent_id is None:
            continue

        # -- a child's parent reference must be remapped to one of the new ids --
        assert new_parent_id in new_ids, "Parent ID not in the list of IDs"
        assert (
            new_parent_id != original.metadata.parent_id
        ), "Parent ID hasn't changed after recalculation"
def test_hash_ids_are_deterministic():
    """Re-hashing a fixed parent/children list yields these exact ids and parent references."""
    parent = Text(text="Parent", metadata=ElementMetadata(page_number=1))
    first_child = Text(
        text="Element", metadata=ElementMetadata(page_number=1, parent_id=parent.id)
    )
    second_child = Text(
        text="Element", metadata=ElementMetadata(page_number=1, parent_id=parent.id)
    )

    rehashed_elements = assign_and_map_hash_ids([parent, first_child, second_child])

    new_ids = [rehashed.id for rehashed in rehashed_elements]
    new_parent_ids = [rehashed.metadata.parent_id for rehashed in rehashed_elements]
    parent_hash = "ea9eb7e80383c190f8cafce1ad666624"

    assert new_ids == [
        parent_hash,
        "4112a8d24886276e18e759d06956021b",
        "eba84bbe7f03e8b91a1527323040ee3d",
    ]
    # -- both children reference the parent's new hash; the parent itself has no parent --
    assert new_parent_ids == [None, parent_hash, parent_hash]
@pytest.mark.parametrize(
    ("text", "sequence_number", "filename", "page_number", "expected_hash"),
    [
        # -- pdf files support page numbers --
        ("foo", 1, "foo.pdf", 1, "4bb264eb23ceb44cd8fcc5af44f8dc71"),
        ("foo", 2, "foo.pdf", 1, "75fc1de48cf724ec00aa8d1c5a0d3758"),
        # -- txt files don't have a page number --
        ("some text", 0, "some.txt", None, "1a2627b5760c06b1440102f11a1edb0f"),
        ("some text", 1, "some.txt", None, "e3fd10d867c4a1c0264dde40e3d7e45a"),
    ],
)
def test_id_to_hash_calculates(
    text: str,
    sequence_number: int,
    filename: str,
    page_number: int | None,
    expected_hash: str,
):
    """`.id_to_hash()` returns the expected hash for each case and stores it as the element id.

    The parametrized cases keep text, filename and page number fixed within each pair and vary
    only the sequence number, and each case expects a different hash.
    """
    metadata = ElementMetadata(filename=filename, page_number=page_number)
    element = Text(text=text, metadata=metadata)

    assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
    assert element.id == expected_hash, "ID should be set"