mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-05 08:02:48 +00:00

**Summary** This final PR in the "orig_elements" series adds the needful such that `.metadata.orig_elements`, when present on a chunk (element), is serialized to JSON when the chunk is serialized, for instance, to be used in an HTTP response payload. It also provides for deserializing such a JSON payload into chunks that contain the `.orig_elements` metadata. **Additional Context** Note that `.metadata.orig_elements` is always `Optional[list[Element]]` when in memory. However, those original elements are serialized as Base64-encoded gzipped JSON and are in that form (str) when present as JSON or as "element-dicts" which is an intermediate serialization/deserialization format. That is, serialization is `Element -> dict -> JSON` and deserialization is `JSON -> dict -> Element` and `.orig_elements` are Base64-encoded in both the `dict` and `JSON` forms. --------- Co-authored-by: scanny <scanny@users.noreply.github.com>
652 lines
24 KiB
Python
652 lines
24 KiB
Python
# pyright: reportPrivateUsage=false
|
|
|
|
"""Test-suite for `unstructured.documents.elements` module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import pathlib
|
|
from functools import partial
|
|
|
|
import pytest
|
|
|
|
from unstructured.cleaners.core import clean_bullets, clean_prefix
|
|
from unstructured.documents.coordinates import (
|
|
CoordinateSystem,
|
|
Orientation,
|
|
RelativeCoordinateSystem,
|
|
)
|
|
from unstructured.documents.elements import (
|
|
UUID,
|
|
ConsolidationStrategy,
|
|
CoordinatesMetadata,
|
|
DataSourceMetadata,
|
|
Element,
|
|
ElementMetadata,
|
|
NoID,
|
|
Points,
|
|
RegexMetadata,
|
|
Text,
|
|
Title,
|
|
)
|
|
|
|
|
|
def test_text_id():
|
|
text_element = Text(text="hello there!")
|
|
assert text_element.id == "c69509590d81db2f37f9d75480c8efed"
|
|
|
|
|
|
def test_text_uuid():
|
|
text_element = Text(text="hello there!", element_id=UUID())
|
|
|
|
id = text_element.id
|
|
|
|
assert isinstance(id, str)
|
|
assert len(id) == 36
|
|
assert id.count("-") == 4
|
|
# -- Test that the element is JSON serializable. This shold run without an error --
|
|
json.dumps(text_element.to_dict())
|
|
|
|
|
|
def test_element_defaults_to_blank_id():
|
|
element = Element()
|
|
assert isinstance(element.id, NoID)
|
|
|
|
|
|
def test_element_uuid():
|
|
element = Element(element_id=UUID())
|
|
assert isinstance(element.id, UUID)
|
|
|
|
|
|
def test_text_element_apply_cleaners():
|
|
text_element = Text(text="[1] A Textbook on Crocodile Habitats")
|
|
|
|
text_element.apply(partial(clean_prefix, pattern=r"\[\d{1,2}\]"))
|
|
assert str(text_element) == "A Textbook on Crocodile Habitats"
|
|
|
|
|
|
def test_text_element_apply_multiple_cleaners():
|
|
cleaners = [partial(clean_prefix, pattern=r"\[\d{1,2}\]"), partial(clean_bullets)]
|
|
text_element = Text(text="[1] \u2022 A Textbook on Crocodile Habitats")
|
|
text_element.apply(*cleaners)
|
|
assert str(text_element) == "A Textbook on Crocodile Habitats"
|
|
|
|
|
|
def test_apply_raises_if_func_does_not_produce_string():
|
|
def bad_cleaner(s: str):
|
|
return 1
|
|
|
|
text_element = Text(text="[1] A Textbook on Crocodile Habitats")
|
|
|
|
with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
|
|
text_element.apply(bad_cleaner) # pyright: ignore[reportArgumentType]
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
("coordinates", "orientation1", "orientation2", "expected_coords"),
|
|
[
|
|
(
|
|
((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
Orientation.CARTESIAN,
|
|
Orientation.CARTESIAN,
|
|
((10, 20), (10, 40), (30, 40), (30, 20)),
|
|
),
|
|
(
|
|
((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
Orientation.CARTESIAN,
|
|
Orientation.SCREEN,
|
|
((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
|
|
),
|
|
(
|
|
((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
Orientation.SCREEN,
|
|
Orientation.CARTESIAN,
|
|
((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
|
|
),
|
|
(
|
|
((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
Orientation.SCREEN,
|
|
Orientation.SCREEN,
|
|
((10, 20), (10, 40), (30, 40), (30, 20)),
|
|
),
|
|
],
|
|
)
|
|
def test_convert_coordinates_to_new_system(
|
|
coordinates: Points,
|
|
orientation1: Orientation,
|
|
orientation2: Orientation,
|
|
expected_coords: Points,
|
|
):
|
|
coord1 = CoordinateSystem(100, 200)
|
|
coord1.orientation = orientation1
|
|
coord2 = CoordinateSystem(1000, 2000)
|
|
coord2.orientation = orientation2
|
|
element = Element(coordinates=coordinates, coordinate_system=coord1)
|
|
|
|
new_coords = element.convert_coordinates_to_new_system(coord2)
|
|
|
|
assert new_coords is not None
|
|
for new_coord, expected in zip(new_coords, expected_coords):
|
|
assert new_coord == pytest.approx(expected) # pyright: ignore[reportUnknownMemberType]
|
|
element.convert_coordinates_to_new_system(coord2, in_place=True)
|
|
assert element.metadata.coordinates is not None
|
|
assert element.metadata.coordinates.points is not None
|
|
for new_coord, expected in zip(element.metadata.coordinates.points, expected_coords):
|
|
assert new_coord == pytest.approx(expected) # pyright: ignore[reportUnknownMemberType]
|
|
assert element.metadata.coordinates.system == coord2
|
|
|
|
|
|
def test_convert_coordinate_to_new_system_none():
|
|
element = Element(coordinates=None, coordinate_system=None)
|
|
coord = CoordinateSystem(100, 200)
|
|
coord.orientation = Orientation.SCREEN
|
|
assert element.convert_coordinates_to_new_system(coord) is None
|
|
|
|
|
|
def test_element_constructor_coordinates_all_present():
|
|
coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
|
|
coordinate_system = RelativeCoordinateSystem()
|
|
element = Element(coordinates=coordinates, coordinate_system=coordinate_system)
|
|
expected_coordinates_metadata = CoordinatesMetadata(
|
|
points=coordinates,
|
|
system=coordinate_system,
|
|
)
|
|
assert element.metadata.coordinates == expected_coordinates_metadata
|
|
|
|
|
|
def test_element_constructor_coordinates_points_absent():
|
|
with pytest.raises(ValueError) as exc_info:
|
|
Element(coordinate_system=RelativeCoordinateSystem())
|
|
assert (
|
|
str(exc_info.value)
|
|
== "Coordinates points should not exist without coordinates system and vice versa."
|
|
)
|
|
|
|
|
|
def test_element_constructor_coordinates_system_absent():
|
|
with pytest.raises(ValueError) as exc_info:
|
|
Element(coordinates=((1, 2), (1, 4), (3, 4), (3, 2)))
|
|
assert (
|
|
str(exc_info.value)
|
|
== "Coordinates points should not exist without coordinates system and vice versa."
|
|
)
|
|
|
|
|
|
def test_coordinate_metadata_serdes():
|
|
coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
|
|
coordinate_system = RelativeCoordinateSystem()
|
|
coordinates_metadata = CoordinatesMetadata(points=coordinates, system=coordinate_system)
|
|
expected_schema = {
|
|
"layout_height": 1,
|
|
"layout_width": 1,
|
|
"points": ((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
"system": "RelativeCoordinateSystem",
|
|
}
|
|
coordinates_metadata_dict = coordinates_metadata.to_dict()
|
|
assert coordinates_metadata_dict == expected_schema
|
|
assert CoordinatesMetadata.from_dict(coordinates_metadata_dict) == coordinates_metadata
|
|
|
|
|
|
def test_element_to_dict():
|
|
coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
|
|
coordinate_system = RelativeCoordinateSystem()
|
|
element = Element(
|
|
element_id="awt32t1",
|
|
coordinates=coordinates,
|
|
coordinate_system=coordinate_system,
|
|
)
|
|
|
|
assert element.to_dict() == {
|
|
"metadata": {
|
|
"coordinates": {
|
|
"layout_height": 1,
|
|
"layout_width": 1,
|
|
"points": ((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
"system": "RelativeCoordinateSystem",
|
|
},
|
|
},
|
|
"type": None,
|
|
"text": "",
|
|
"element_id": "awt32t1",
|
|
}
|
|
|
|
|
|
def test_regex_metadata_round_trips_through_JSON():
|
|
"""metadata.regex_metadata should appear at full depth in JSON."""
|
|
regex_metadata = {
|
|
"mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
|
|
"version": [
|
|
RegexMetadata(text="current=v1.7.2", start=7, end=21),
|
|
RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
|
|
],
|
|
}
|
|
metadata = ElementMetadata(regex_metadata=regex_metadata)
|
|
|
|
metadata_json = json.dumps(metadata.to_dict())
|
|
deserialized_metadata = ElementMetadata.from_dict(json.loads(metadata_json))
|
|
reserialized_metadata_json = json.dumps(deserialized_metadata.to_dict())
|
|
|
|
assert reserialized_metadata_json == metadata_json
|
|
|
|
|
|
class DescribeElementMetadata:
|
|
"""Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""
|
|
|
|
# -- It can be constructed with known keyword arguments. In particular, including a non-known
|
|
# -- keyword argument produces a type-error at development time and raises an exception at
|
|
# -- runtime. This catches typos before they reach production.
|
|
|
|
def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
|
|
with pytest.raises(TypeError, match="got an unexpected keyword argument 'file_name'"):
|
|
ElementMetadata(file_name="memo.docx") # pyright: ignore[reportCallIssue]
|
|
|
|
@pytest.mark.parametrize(
|
|
"file_path",
|
|
[
|
|
pathlib.Path("documents/docx") / "memos" / "memo-2023-11-10.docx",
|
|
"documents/docx/memos/memo-2023-11-10.docx",
|
|
],
|
|
)
|
|
def it_accommodates_either_a_pathlib_Path_or_str_for_its_filename_arg(
|
|
self, file_path: pathlib.Path | str
|
|
):
|
|
meta = ElementMetadata(filename=file_path)
|
|
|
|
assert meta.file_directory == "documents/docx/memos"
|
|
assert meta.filename == "memo-2023-11-10.docx"
|
|
|
|
def it_leaves_both_filename_and_file_directory_None_when_neither_is_specified(self):
|
|
meta = ElementMetadata()
|
|
|
|
assert meta.file_directory is None
|
|
assert meta.filename is None
|
|
|
|
@pytest.mark.parametrize("file_path", [pathlib.Path("memo.docx"), "memo.docx"])
|
|
def and_it_leaves_file_directory_None_when_not_specified_and_filename_is_not_a_path(
|
|
self, file_path: pathlib.Path | str
|
|
):
|
|
meta = ElementMetadata(filename=file_path)
|
|
|
|
assert meta.file_directory is None
|
|
assert meta.filename == "memo.docx"
|
|
|
|
def and_it_splits_off_directory_path_from_its_filename_arg_when_it_is_a_file_path(self):
|
|
meta = ElementMetadata(filename="documents/docx/memo-2023-11-11.docx")
|
|
|
|
assert meta.file_directory == "documents/docx"
|
|
assert meta.filename == "memo-2023-11-11.docx"
|
|
|
|
def but_it_prefers_a_specified_file_directory_when_filename_also_contains_a_path(self):
|
|
meta = ElementMetadata(filename="tmp/staging/memo.docx", file_directory="documents/docx")
|
|
|
|
assert meta.file_directory == "documents/docx"
|
|
assert meta.filename == "memo.docx"
|
|
|
|
# -- It knows the types of its known members so type-checking support is available. --
|
|
|
|
def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
|
|
ElementMetadata(
|
|
category_depth="2", # pyright: ignore[reportArgumentType]
|
|
file_directory=True, # pyright: ignore[reportArgumentType]
|
|
text_as_html=42, # pyright: ignore[reportArgumentType]
|
|
)
|
|
# -- it does not check types at runtime however (choosing to avoid validation overhead) --
|
|
|
|
# -- It only stores a field's value when it is not None. --
|
|
|
|
def it_returns_the_value_of_an_attribute_it_has(self):
|
|
meta = ElementMetadata(url="https://google.com")
|
|
assert "url" in meta.__dict__
|
|
assert meta.url == "https://google.com"
|
|
|
|
def and_it_returns_None_for_a_known_attribute_it_does_not_have(self):
|
|
meta = ElementMetadata()
|
|
assert "url" not in meta.__dict__
|
|
assert meta.url is None
|
|
|
|
def but_it_raises_AttributeError_for_an_unknown_attribute_it_does_not_have(self):
|
|
meta = ElementMetadata()
|
|
assert "coefficient" not in meta.__dict__
|
|
with pytest.raises(AttributeError, match="object has no attribute 'coefficient'"):
|
|
meta.coefficient
|
|
|
|
def it_stores_a_non_None_field_value_when_assigned(self):
|
|
meta = ElementMetadata()
|
|
assert "file_directory" not in meta.__dict__
|
|
meta.file_directory = "tmp/"
|
|
assert "file_directory" in meta.__dict__
|
|
assert meta.file_directory == "tmp/"
|
|
|
|
def it_removes_a_field_when_None_is_assigned_to_it(self):
|
|
meta = ElementMetadata(file_directory="tmp/")
|
|
assert "file_directory" in meta.__dict__
|
|
assert meta.file_directory == "tmp/"
|
|
|
|
meta.file_directory = None
|
|
assert "file_directory" not in meta.__dict__
|
|
assert meta.file_directory is None
|
|
|
|
# -- It can serialize itself to a dict -------------------------------------------------------
|
|
|
|
def it_can_serialize_itself_to_a_dict(self):
|
|
meta = ElementMetadata(
|
|
category_depth=1,
|
|
file_directory="tmp/",
|
|
page_number=2,
|
|
text_as_html="<table></table>",
|
|
url="https://google.com",
|
|
)
|
|
assert meta.to_dict() == {
|
|
"category_depth": 1,
|
|
"file_directory": "tmp/",
|
|
"page_number": 2,
|
|
"text_as_html": "<table></table>",
|
|
"url": "https://google.com",
|
|
}
|
|
|
|
def and_it_serializes_a_coordinates_sub_object_to_a_dict_when_it_is_present(self):
|
|
meta = ElementMetadata(
|
|
category_depth=1,
|
|
coordinates=CoordinatesMetadata(
|
|
points=((2, 2), (1, 4), (3, 4), (3, 2)),
|
|
system=RelativeCoordinateSystem(),
|
|
),
|
|
page_number=2,
|
|
)
|
|
assert meta.to_dict() == {
|
|
"category_depth": 1,
|
|
"coordinates": {
|
|
"layout_height": 1,
|
|
"layout_width": 1,
|
|
"points": ((2, 2), (1, 4), (3, 4), (3, 2)),
|
|
"system": "RelativeCoordinateSystem",
|
|
},
|
|
"page_number": 2,
|
|
}
|
|
|
|
def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self):
|
|
meta = ElementMetadata(
|
|
category_depth=1,
|
|
data_source=DataSourceMetadata(
|
|
url="https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
date_created="2023-11-09",
|
|
),
|
|
page_number=2,
|
|
)
|
|
assert meta.to_dict() == {
|
|
"category_depth": 1,
|
|
"data_source": {
|
|
"url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
"date_created": "2023-11-09",
|
|
},
|
|
"page_number": 2,
|
|
}
|
|
|
|
def and_it_serializes_an_orig_elements_sub_object_to_base64_when_it_is_present(self):
|
|
meta = ElementMetadata(
|
|
category_depth=1,
|
|
orig_elements=[Title("Lorem"), Text("Lorem Ipsum")],
|
|
page_number=2,
|
|
)
|
|
assert meta.to_dict() == {
|
|
"category_depth": 1,
|
|
"orig_elements": (
|
|
"eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWt"
|
|
"JVm5WDoqNUL0UoJrqtLHJHaF6JFDChw2v6zbzfjkvD2OM/YZ8GvC/Khb7lBs5LcilUwRyCsblQYTiBQp"
|
|
"ZRxYZcCA/1spDtP98dU6DTEw3sa5fWOqs10vH0cLQn0="
|
|
),
|
|
"page_number": 2,
|
|
}
|
|
|
|
def but_unlike_in_ElementMetadata_unknown_fields_in_sub_objects_are_ignored(self):
|
|
"""Metadata sub-objects ignore fields they do not explicitly define.
|
|
|
|
This is _not_ the case for ElementMetadata itself where an non-known field is welcomed as a
|
|
user-defined ad-hoc metadata field.
|
|
"""
|
|
element_metadata = {
|
|
"new_field": "hello",
|
|
"data_source": {
|
|
"new_field": "world",
|
|
},
|
|
"coordinates": {
|
|
"new_field": "foo",
|
|
},
|
|
}
|
|
|
|
metadata = ElementMetadata.from_dict(element_metadata)
|
|
metadata_dict = metadata.to_dict()
|
|
|
|
assert "new_field" in metadata_dict
|
|
assert "new_field" not in metadata_dict["coordinates"]
|
|
assert "new_field" not in metadata_dict["data_source"]
|
|
|
|
# -- It can deserialize itself from a dict ---------------------------------------------------
|
|
|
|
def it_can_deserialize_itself_from_a_dict(self):
|
|
meta_dict = {
|
|
"category_depth": 1,
|
|
"coefficient": 0.58,
|
|
"coordinates": {
|
|
"layout_height": 4,
|
|
"layout_width": 2,
|
|
"points": ((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
"system": "RelativeCoordinateSystem",
|
|
},
|
|
"data_source": {
|
|
"url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
"date_created": "2023-11-09",
|
|
},
|
|
"languages": ["eng"],
|
|
}
|
|
|
|
meta = ElementMetadata.from_dict(meta_dict)
|
|
|
|
# -- known fields present in dict are present in meta --
|
|
assert meta.category_depth == 1
|
|
|
|
# -- known sub-object fields present in dict are present in meta --
|
|
assert meta.coordinates == CoordinatesMetadata(
|
|
points=((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
system=RelativeCoordinateSystem(),
|
|
)
|
|
assert meta.data_source == DataSourceMetadata(
|
|
url="https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
date_created="2023-11-09",
|
|
)
|
|
|
|
# -- known fields absent from dict report None but are not present in meta --
|
|
assert meta.file_directory is None
|
|
assert "file_directory" not in meta.__dict__
|
|
|
|
# -- non-known fields present in dict are present in meta (we have no way to tell whether
|
|
# -- they are "ad-hoc" or not because we lack indication of user-intent)
|
|
assert meta.coefficient == 0.58
|
|
|
|
# -- ad-hoc fields absent from dict raise on attempted access --
|
|
with pytest.raises(AttributeError, match="ntMetadata' object has no attribute 'quotient'"):
|
|
meta.quotient
|
|
|
|
# -- but that can be worked around by end-user --
|
|
assert (meta.quotient if hasattr(meta, "quotient") else None) is None
|
|
|
|
# -- mutating a mutable (collection) field does not affect the original value --
|
|
assert isinstance(meta.languages, list)
|
|
assert meta.languages == ["eng"]
|
|
meta.languages.append("spa")
|
|
assert meta.languages == ["eng", "spa"]
|
|
assert meta_dict["languages"] == ["eng"]
|
|
|
|
# -- It allows downstream users to add an arbitrary new member by assignment. ----------------
|
|
|
|
def it_allows_an_end_user_to_add_an_arbitrary_field(self):
|
|
meta = ElementMetadata()
|
|
meta.foobar = 7
|
|
assert "foobar" in meta.__dict__
|
|
assert meta.foobar == 7
|
|
|
|
def and_fields_so_added_appear_in_the_metadata_JSON(self):
|
|
meta = ElementMetadata()
|
|
meta.foobar = 7
|
|
assert meta.to_dict() == {"foobar": 7}
|
|
|
|
def and_it_removes_an_end_user_field_when_it_is_assigned_None(self):
|
|
meta = ElementMetadata()
|
|
meta.foobar = 7
|
|
assert "foobar" in meta.__dict__
|
|
meta.foobar = None
|
|
assert "foobar" not in meta.__dict__
|
|
with pytest.raises(
|
|
AttributeError, match="'ElementMetadata' object has no attribute 'foobar'"
|
|
):
|
|
meta.foobar
|
|
|
|
# -- It can update itself from another instance ----------------------------------------------
|
|
|
|
def it_can_update_itself_from_another_instance(self):
|
|
meta = ElementMetadata(category_depth=1, page_number=1)
|
|
meta.coefficient = 0.58
|
|
meta.stem_length = 18
|
|
other = ElementMetadata(file_directory="tmp/", page_number=2)
|
|
other.quotient = 1.4
|
|
other.stem_length = 20
|
|
|
|
meta.update(other)
|
|
|
|
# -- known-fields present on self but not other are unchanged --
|
|
assert meta.category_depth == 1
|
|
# -- known-fields present on other but not self are added --
|
|
assert meta.file_directory == "tmp/"
|
|
# -- known-fields present on both self and other are updated --
|
|
assert meta.page_number == 2
|
|
# -- ad-hoc-fields present on self but not other are unchanged --
|
|
assert meta.coefficient == 0.58
|
|
# -- ad-hoc-fields present on other but not self are added --
|
|
assert meta.quotient == 1.4
|
|
# -- ad-hoc-fields present on both self and other are updated --
|
|
assert meta.stem_length == 20
|
|
# -- other is left unchanged --
|
|
assert other.category_depth is None
|
|
assert other.file_directory == "tmp/"
|
|
assert other.page_number == 2
|
|
assert other.text_as_html is None
|
|
assert other.url is None
|
|
assert other.quotient == 1.4
|
|
assert other.stem_length == 20
|
|
with pytest.raises(AttributeError, match="etadata' object has no attribute 'coefficient'"):
|
|
other.coefficient
|
|
|
|
def but_it_raises_on_attempt_to_update_from_a_non_ElementMetadata_object(self):
|
|
meta = ElementMetadata()
|
|
with pytest.raises(ValueError, match=r"ate\(\)' must be an instance of 'ElementMetadata'"):
|
|
meta.update({"coefficient": "0.56"}) # pyright: ignore[reportArgumentType]
|
|
|
|
# -- It knows when it is equal to another instance -------------------------------------------
|
|
|
|
def it_is_equal_to_another_instance_with_the_same_known_field_values(self):
|
|
meta = ElementMetadata(
|
|
category_depth=1,
|
|
coordinates=CoordinatesMetadata(
|
|
points=((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
system=RelativeCoordinateSystem(),
|
|
),
|
|
data_source=DataSourceMetadata(
|
|
url="https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
date_created="2023-11-08",
|
|
),
|
|
file_directory="tmp/",
|
|
languages=["eng"],
|
|
page_number=2,
|
|
text_as_html="<table></table>",
|
|
url="https://google.com",
|
|
)
|
|
assert meta == ElementMetadata(
|
|
category_depth=1,
|
|
coordinates=CoordinatesMetadata(
|
|
points=((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
system=RelativeCoordinateSystem(),
|
|
),
|
|
data_source=DataSourceMetadata(
|
|
url="https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
date_created="2023-11-08",
|
|
),
|
|
file_directory="tmp/",
|
|
languages=["eng"],
|
|
page_number=2,
|
|
text_as_html="<table></table>",
|
|
url="https://google.com",
|
|
)
|
|
|
|
def but_it_is_never_equal_to_a_non_ElementMetadata_object(self):
|
|
class NotElementMetadata:
|
|
pass
|
|
|
|
meta = ElementMetadata()
|
|
other = NotElementMetadata()
|
|
|
|
# -- all the "fields" are the same --
|
|
assert meta.__dict__ == other.__dict__
|
|
# -- but it is rejected solely because its type is different --
|
|
assert meta != other
|
|
|
|
def it_is_equal_to_another_instance_with_the_same_ad_hoc_field_values(self):
|
|
meta = ElementMetadata(category_depth=1)
|
|
meta.coefficient = 0.58
|
|
other = ElementMetadata(category_depth=1)
|
|
other.coefficient = 0.58
|
|
|
|
assert meta == other
|
|
|
|
def but_it_is_not_equal_to_an_instance_with_ad_hoc_fields_that_differ(self):
|
|
meta = ElementMetadata(category_depth=1)
|
|
meta.coefficient = 0.58
|
|
other = ElementMetadata(category_depth=1)
|
|
other.coefficient = 0.72
|
|
|
|
assert meta != other
|
|
|
|
def it_is_not_equal_when_a_list_field_contains_different_items(self):
|
|
meta = ElementMetadata(languages=["eng"])
|
|
assert meta != ElementMetadata(languages=["eng", "spa"])
|
|
|
|
def and_it_is_not_equal_when_the_coordinates_sub_object_field_differs(self):
|
|
meta = ElementMetadata(
|
|
coordinates=CoordinatesMetadata(
|
|
points=((1, 2), (1, 4), (3, 4), (3, 2)),
|
|
system=RelativeCoordinateSystem(),
|
|
)
|
|
)
|
|
assert meta != ElementMetadata(
|
|
coordinates=CoordinatesMetadata(
|
|
points=((2, 2), (2, 4), (3, 4), (4, 2)),
|
|
system=RelativeCoordinateSystem(),
|
|
)
|
|
)
|
|
|
|
def and_it_is_not_equal_when_the_data_source_sub_object_field_differs(self):
|
|
meta = ElementMetadata(
|
|
data_source=DataSourceMetadata(
|
|
url="https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
date_created="2023-11-08",
|
|
)
|
|
)
|
|
assert meta != ElementMetadata(
|
|
data_source=DataSourceMetadata(
|
|
url="https://www.nih.gov/about-nih/who-we-are/nih-director",
|
|
date_created="2023-11-09",
|
|
)
|
|
)
|
|
|
|
# -- There is a consolidation-strategy for all known fields ----------------------------------
|
|
|
|
def it_can_find_the_consolidation_strategy_for_each_of_its_known_fields(self):
|
|
metadata = ElementMetadata()
|
|
metadata_field_names = sorted(metadata._known_field_names)
|
|
consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()
|
|
|
|
for field_name in metadata_field_names:
|
|
assert field_name in consolidation_strategies, (
|
|
f"ElementMetadata field `.{field_name}` does not have a consolidation strategy."
|
|
f" Add one in `ConsolidationStrategy.field_consolidation_strategies()."
|
|
)
|