Steve Canny 56fbaaed10
feat(chunking): add metadata.orig_elements serde (#2680)
**Summary**
This final PR in the "orig_elements" series adds what is needed so that
`.metadata.orig_elements`, when present on a chunk (element), is
serialized to JSON when the chunk is serialized, for instance, to be
used in an HTTP response payload.

It also provides for deserializing such a JSON payload into chunks that
contain the `.orig_elements` metadata.

**Additional Context**
Note that `.metadata.orig_elements` is always `Optional[list[Element]]`
when in memory. However, those original elements are serialized as
Base64-encoded gzipped JSON and are in that form (str) when present as
JSON or as "element-dicts" which is an intermediate
serialization/deserialization format. That is, serialization is `Element
-> dict -> JSON` and deserialization is `JSON -> dict -> Element` and
`.orig_elements` are Base64-encoded in both the `dict` and `JSON` forms.

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
2024-03-22 21:53:26 +00:00

652 lines
24 KiB
Python

# pyright: reportPrivateUsage=false
"""Test-suite for `unstructured.documents.elements` module."""
from __future__ import annotations
import json
import pathlib
from functools import partial
import pytest
from unstructured.cleaners.core import clean_bullets, clean_prefix
from unstructured.documents.coordinates import (
CoordinateSystem,
Orientation,
RelativeCoordinateSystem,
)
from unstructured.documents.elements import (
UUID,
ConsolidationStrategy,
CoordinatesMetadata,
DataSourceMetadata,
Element,
ElementMetadata,
NoID,
Points,
RegexMetadata,
Text,
Title,
)
def test_text_id():
    """A `Text` built from text alone exposes the expected fixed id string."""
    expected_id = "c69509590d81db2f37f9d75480c8efed"

    assert Text(text="hello there!").id == expected_id
def test_text_uuid():
    """A `Text` constructed with `element_id=UUID()` exposes a UUID-formatted string id.

    Also checks that the element's dict form can be passed to `json.dumps()`.
    """
    text_element = Text(text="hello there!", element_id=UUID())

    # -- named `element_id` rather than `id` so the `id()` builtin is not shadowed --
    element_id = text_element.id

    assert isinstance(element_id, str)
    # -- 36 characters including 4 hyphens, e.g. "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" --
    assert len(element_id) == 36
    assert element_id.count("-") == 4
    # -- Test that the element is JSON serializable. This should run without an error --
    json.dumps(text_element.to_dict())
def test_element_defaults_to_blank_id():
    """An `Element` constructed without arguments has a `NoID` instance as its id."""
    assert isinstance(Element().id, NoID)
def test_element_uuid():
    """An `Element` constructed with `element_id=UUID()` keeps a `UUID` instance as its id."""
    assert isinstance(Element(element_id=UUID()).id, UUID)
def test_text_element_apply_cleaners():
    """`Text.apply()` with a single prefix-cleaner removes the leading "[1]" marker."""
    strip_citation_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Text(text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_citation_prefix)

    assert str(element) == "A Textbook on Crocodile Habitats"
def test_text_element_apply_multiple_cleaners():
    """`Text.apply()` accepts several cleaners; the "[1]" prefix and the bullet are both gone."""
    strip_citation_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    strip_bullets = partial(clean_bullets)
    element = Text(text="[1] \u2022 A Textbook on Crocodile Habitats")

    element.apply(strip_citation_prefix, strip_bullets)

    assert str(element) == "A Textbook on Crocodile Habitats"
def test_apply_raises_if_func_does_not_produce_string():
    """`Text.apply()` raises `ValueError` when a cleaner returns something other than a str."""

    def returns_an_int(s: str):
        # -- deliberately violates the cleaner contract by returning an int --
        return 1

    element = Text(text="[1] A Textbook on Crocodile Habitats")

    with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
        element.apply(returns_an_int)  # pyright: ignore[reportArgumentType]
@pytest.mark.parametrize(
    ("coordinates", "orientation1", "orientation2", "expected_coords"),
    [
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.CARTESIAN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.SCREEN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.CARTESIAN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.SCREEN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
    ],
)
def test_convert_coordinates_to_new_system(
    coordinates: Points,
    orientation1: Orientation,
    orientation2: Orientation,
    expected_coords: Points,
):
    """Element points convert from `CoordinateSystem(100, 200)` to `CoordinateSystem(1000, 2000)`.

    Each case sets the orientation of the source and target systems, then checks the converted
    points twice: first as the return value of a non-mutating call, then on the element's own
    metadata after an `in_place=True` call.
    """
    coord1 = CoordinateSystem(100, 200)
    coord1.orientation = orientation1
    coord2 = CoordinateSystem(1000, 2000)
    coord2.orientation = orientation2
    element = Element(coordinates=coordinates, coordinate_system=coord1)

    # -- non-mutating conversion returns the new points --
    new_coords = element.convert_coordinates_to_new_system(coord2)

    assert new_coords is not None
    # -- `zip()` stops at the shorter input, so without this a dropped point would go unnoticed --
    assert len(new_coords) == len(expected_coords)
    for new_coord, expected in zip(new_coords, expected_coords):
        assert new_coord == pytest.approx(expected)  # pyright: ignore[reportUnknownMemberType]

    # -- in-place conversion updates the element's coordinates metadata --
    element.convert_coordinates_to_new_system(coord2, in_place=True)

    assert element.metadata.coordinates is not None
    assert element.metadata.coordinates.points is not None
    assert len(element.metadata.coordinates.points) == len(expected_coords)
    for new_coord, expected in zip(element.metadata.coordinates.points, expected_coords):
        assert new_coord == pytest.approx(expected)  # pyright: ignore[reportUnknownMemberType]
    assert element.metadata.coordinates.system == coord2
def test_convert_coordinate_to_new_system_none():
    """Converting an element that has no coordinates returns `None`."""
    target_system = CoordinateSystem(100, 200)
    target_system.orientation = Orientation.SCREEN
    element = Element(coordinates=None, coordinate_system=None)

    converted = element.convert_coordinates_to_new_system(target_system)

    assert converted is None
def test_element_constructor_coordinates_all_present():
    """Points and system passed to `Element()` are stored as a `CoordinatesMetadata`."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    system = RelativeCoordinateSystem()

    element = Element(coordinates=points, coordinate_system=system)

    assert element.metadata.coordinates == CoordinatesMetadata(points=points, system=system)
def test_element_constructor_coordinates_points_absent():
    """`Element()` raises `ValueError` when given a coordinate system but no points."""
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )

    with pytest.raises(ValueError) as raised:
        Element(coordinate_system=RelativeCoordinateSystem())

    assert str(raised.value) == expected_message
def test_element_constructor_coordinates_system_absent():
    """`Element()` raises `ValueError` when given points but no coordinate system."""
    expected_message = (
        "Coordinates points should not exist without coordinates system and vice versa."
    )
    points = ((1, 2), (1, 4), (3, 4), (3, 2))

    with pytest.raises(ValueError) as raised:
        Element(coordinates=points)

    assert str(raised.value) == expected_message
def test_coordinate_metadata_serdes():
    """`CoordinatesMetadata` serializes to the expected dict and round-trips via `from_dict()`."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    original = CoordinatesMetadata(points=points, system=RelativeCoordinateSystem())

    as_dict = original.to_dict()

    assert as_dict == {
        "layout_height": 1,
        "layout_width": 1,
        "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
        "system": "RelativeCoordinateSystem",
    }
    # -- deserializing the dict form yields an equal object --
    assert CoordinatesMetadata.from_dict(as_dict) == original
def test_element_to_dict():
    """`Element.to_dict()` emits metadata, type, text and element_id in the expected shape."""
    points = ((1, 2), (1, 4), (3, 4), (3, 2))
    expected = {
        "metadata": {
            "coordinates": {
                "layout_height": 1,
                "layout_width": 1,
                "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
        },
        "type": None,
        "text": "",
        "element_id": "awt32t1",
    }

    element = Element(
        element_id="awt32t1",
        coordinates=points,
        coordinate_system=RelativeCoordinateSystem(),
    )

    assert element.to_dict() == expected
def test_regex_metadata_round_trips_through_JSON():
    """metadata.regex_metadata should appear at full depth in JSON.

    Serializes to JSON, deserializes, serializes again, and requires both JSON strings to match.
    """
    metadata = ElementMetadata(
        regex_metadata={
            "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
            "version": [
                RegexMetadata(text="current=v1.7.2", start=7, end=21),
                RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
            ],
        }
    )

    first_pass_json = json.dumps(metadata.to_dict())
    restored = ElementMetadata.from_dict(json.loads(first_pass_json))
    second_pass_json = json.dumps(restored.to_dict())

    assert second_pass_json == first_pass_json
class DescribeElementMetadata:
    """Unit-test suite for `unstructured.documents.elements.ElementMetadata`.

    Tests are grouped by behavior; each group is introduced by a `# -- ... --` banner comment.
    """

    # -- It can be constructed with known keyword arguments. In particular, including a non-known
    # -- keyword argument produces a type-error at development time and raises an exception at
    # -- runtime. This catches typos before they reach production.

    def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
        """An unknown keyword argument raises `TypeError` (and needs a pyright ignore)."""
        with pytest.raises(TypeError, match="got an unexpected keyword argument 'file_name'"):
            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportCallIssue]

    @pytest.mark.parametrize(
        "file_path",
        [
            pathlib.Path("documents/docx") / "memos" / "memo-2023-11-10.docx",
            "documents/docx/memos/memo-2023-11-10.docx",
        ],
    )
    def it_accommodates_either_a_pathlib_Path_or_str_for_its_filename_arg(
        self, file_path: pathlib.Path | str
    ):
        """A `Path` or `str` filename is split into `.file_directory` and `.filename`."""
        meta = ElementMetadata(filename=file_path)

        assert meta.file_directory == "documents/docx/memos"
        assert meta.filename == "memo-2023-11-10.docx"

    def it_leaves_both_filename_and_file_directory_None_when_neither_is_specified(self):
        """Both path-related fields report `None` on a default-constructed instance."""
        meta = ElementMetadata()

        assert meta.file_directory is None
        assert meta.filename is None

    @pytest.mark.parametrize("file_path", [pathlib.Path("memo.docx"), "memo.docx"])
    def and_it_leaves_file_directory_None_when_not_specified_and_filename_is_not_a_path(
        self, file_path: pathlib.Path | str
    ):
        """A bare file name (no directory part) leaves `.file_directory` as `None`."""
        meta = ElementMetadata(filename=file_path)

        assert meta.file_directory is None
        assert meta.filename == "memo.docx"

    def and_it_splits_off_directory_path_from_its_filename_arg_when_it_is_a_file_path(self):
        """The directory portion of a `filename` path becomes `.file_directory`."""
        meta = ElementMetadata(filename="documents/docx/memo-2023-11-11.docx")

        assert meta.file_directory == "documents/docx"
        assert meta.filename == "memo-2023-11-11.docx"

    def but_it_prefers_a_specified_file_directory_when_filename_also_contains_a_path(self):
        """An explicit `file_directory` wins over the directory embedded in `filename`."""
        meta = ElementMetadata(filename="tmp/staging/memo.docx", file_directory="documents/docx")

        assert meta.file_directory == "documents/docx"
        assert meta.filename == "memo.docx"

    # -- It knows the types of its known members so type-checking support is available. --

    def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
        """Wrongly-typed arguments need pyright ignores yet construct without raising."""
        ElementMetadata(
            category_depth="2",  # pyright: ignore[reportArgumentType]
            file_directory=True,  # pyright: ignore[reportArgumentType]
            text_as_html=42,  # pyright: ignore[reportArgumentType]
        )
        # -- it does not check types at runtime however (choosing to avoid validation overhead) --

    # -- It only stores a field's value when it is not None. --

    def it_returns_the_value_of_an_attribute_it_has(self):
        """A field given at construction is in `__dict__` and readable as an attribute."""
        meta = ElementMetadata(url="https://google.com")

        assert "url" in meta.__dict__
        assert meta.url == "https://google.com"

    def and_it_returns_None_for_a_known_attribute_it_does_not_have(self):
        """A known field that was never set is absent from `__dict__` but reads as `None`."""
        meta = ElementMetadata()

        assert "url" not in meta.__dict__
        assert meta.url is None

    def but_it_raises_AttributeError_for_an_unknown_attribute_it_does_not_have(self):
        """Reading an unknown, never-assigned attribute raises `AttributeError`."""
        meta = ElementMetadata()

        assert "coefficient" not in meta.__dict__
        with pytest.raises(AttributeError, match="object has no attribute 'coefficient'"):
            meta.coefficient

    def it_stores_a_non_None_field_value_when_assigned(self):
        """Assigning a non-`None` value adds the field to `__dict__`."""
        meta = ElementMetadata()
        assert "file_directory" not in meta.__dict__

        meta.file_directory = "tmp/"

        assert "file_directory" in meta.__dict__
        assert meta.file_directory == "tmp/"

    def it_removes_a_field_when_None_is_assigned_to_it(self):
        """Assigning `None` to a stored field removes it from `__dict__`."""
        meta = ElementMetadata(file_directory="tmp/")
        assert "file_directory" in meta.__dict__
        assert meta.file_directory == "tmp/"

        meta.file_directory = None

        assert "file_directory" not in meta.__dict__
        assert meta.file_directory is None

    # -- It can serialize itself to a dict -------------------------------------------------------

    def it_can_serialize_itself_to_a_dict(self):
        """`to_dict()` contains exactly the fields that were given at construction."""
        meta = ElementMetadata(
            category_depth=1,
            file_directory="tmp/",
            page_number=2,
            text_as_html="<table></table>",
            url="https://google.com",
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "file_directory": "tmp/",
            "page_number": 2,
            "text_as_html": "<table></table>",
            "url": "https://google.com",
        }

    def and_it_serializes_a_coordinates_sub_object_to_a_dict_when_it_is_present(self):
        """A `CoordinatesMetadata` field appears as a nested dict in `to_dict()`."""
        meta = ElementMetadata(
            category_depth=1,
            coordinates=CoordinatesMetadata(
                points=((2, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            ),
            page_number=2,
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "coordinates": {
                "layout_height": 1,
                "layout_width": 1,
                "points": ((2, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
            "page_number": 2,
        }

    def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self):
        """A `DataSourceMetadata` field appears as a nested dict in `to_dict()`."""
        meta = ElementMetadata(
            category_depth=1,
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-09",
            ),
            page_number=2,
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "data_source": {
                "url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
                "date_created": "2023-11-09",
            },
            "page_number": 2,
        }

    def and_it_serializes_an_orig_elements_sub_object_to_base64_when_it_is_present(self):
        """An `orig_elements` list appears in `to_dict()` as a single encoded string."""
        meta = ElementMetadata(
            category_depth=1,
            orig_elements=[Title("Lorem"), Text("Lorem Ipsum")],
            page_number=2,
        )

        assert meta.to_dict() == {
            "category_depth": 1,
            "orig_elements": (
                "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWt"
                "JVm5WDoqNUL0UoJrqtLHJHaF6JFDChw2v6zbzfjkvD2OM/YZ8GvC/Khb7lBs5LcilUwRyCsblQYTiBQp"
                "ZRxYZcCA/1spDtP98dU6DTEw3sa5fWOqs10vH0cLQn0="
            ),
            "page_number": 2,
        }

    def but_unlike_in_ElementMetadata_unknown_fields_in_sub_objects_are_ignored(self):
        """Metadata sub-objects ignore fields they do not explicitly define.

        This is _not_ the case for ElementMetadata itself where a non-known field is welcomed as a
        user-defined ad-hoc metadata field.
        """
        element_metadata = {
            "new_field": "hello",
            "data_source": {
                "new_field": "world",
            },
            "coordinates": {
                "new_field": "foo",
            },
        }

        metadata = ElementMetadata.from_dict(element_metadata)
        metadata_dict = metadata.to_dict()

        assert "new_field" in metadata_dict
        assert "new_field" not in metadata_dict["coordinates"]
        assert "new_field" not in metadata_dict["data_source"]

    # -- It can deserialize itself from a dict ---------------------------------------------------

    def it_can_deserialize_itself_from_a_dict(self):
        """`from_dict()` restores known fields, sub-objects and non-known fields from a dict."""
        meta_dict = {
            "category_depth": 1,
            "coefficient": 0.58,
            "coordinates": {
                "layout_height": 4,
                "layout_width": 2,
                "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
            "data_source": {
                "url": "https://www.nih.gov/about-nih/who-we-are/nih-director",
                "date_created": "2023-11-09",
            },
            "languages": ["eng"],
        }

        meta = ElementMetadata.from_dict(meta_dict)

        # -- known fields present in dict are present in meta --
        assert meta.category_depth == 1

        # -- known sub-object fields present in dict are present in meta --
        assert meta.coordinates == CoordinatesMetadata(
            points=((1, 2), (1, 4), (3, 4), (3, 2)),
            system=RelativeCoordinateSystem(),
        )
        assert meta.data_source == DataSourceMetadata(
            url="https://www.nih.gov/about-nih/who-we-are/nih-director",
            date_created="2023-11-09",
        )

        # -- known fields absent from dict report None but are not present in meta --
        assert meta.file_directory is None
        assert "file_directory" not in meta.__dict__

        # -- non-known fields present in dict are present in meta (we have no way to tell whether
        # -- they are "ad-hoc" or not because we lack indication of user-intent)
        assert meta.coefficient == 0.58

        # -- ad-hoc fields absent from dict raise on attempted access --
        with pytest.raises(AttributeError, match="ntMetadata' object has no attribute 'quotient'"):
            meta.quotient

        # -- but that can be worked around by end-user --
        assert (meta.quotient if hasattr(meta, "quotient") else None) is None

        # -- mutating a mutable (collection) field does not affect the original value --
        assert isinstance(meta.languages, list)
        assert meta.languages == ["eng"]
        meta.languages.append("spa")
        assert meta.languages == ["eng", "spa"]
        assert meta_dict["languages"] == ["eng"]

    # -- It allows downstream users to add an arbitrary new member by assignment. ----------------

    def it_allows_an_end_user_to_add_an_arbitrary_field(self):
        """Assigning to a non-known attribute name stores it like any other field."""
        meta = ElementMetadata()

        meta.foobar = 7

        assert "foobar" in meta.__dict__
        assert meta.foobar == 7

    def and_fields_so_added_appear_in_the_metadata_JSON(self):
        """A field added by assignment is included in `to_dict()`."""
        meta = ElementMetadata()

        meta.foobar = 7

        assert meta.to_dict() == {"foobar": 7}

    def and_it_removes_an_end_user_field_when_it_is_assigned_None(self):
        """Assigning `None` to an ad-hoc field removes it; reading it afterwards raises."""
        meta = ElementMetadata()
        meta.foobar = 7
        assert "foobar" in meta.__dict__

        meta.foobar = None

        assert "foobar" not in meta.__dict__
        with pytest.raises(
            AttributeError, match="'ElementMetadata' object has no attribute 'foobar'"
        ):
            meta.foobar

    # -- It can update itself from another instance ----------------------------------------------

    def it_can_update_itself_from_another_instance(self):
        """`update()` merges known and ad-hoc fields from `other`, leaving `other` untouched."""
        meta = ElementMetadata(category_depth=1, page_number=1)
        meta.coefficient = 0.58
        meta.stem_length = 18
        other = ElementMetadata(file_directory="tmp/", page_number=2)
        other.quotient = 1.4
        other.stem_length = 20

        meta.update(other)

        # -- known-fields present on self but not other are unchanged --
        assert meta.category_depth == 1
        # -- known-fields present on other but not self are added --
        assert meta.file_directory == "tmp/"
        # -- known-fields present on both self and other are updated --
        assert meta.page_number == 2
        # -- ad-hoc-fields present on self but not other are unchanged --
        assert meta.coefficient == 0.58
        # -- ad-hoc-fields present on other but not self are added --
        assert meta.quotient == 1.4
        # -- ad-hoc-fields present on both self and other are updated --
        assert meta.stem_length == 20

        # -- other is left unchanged --
        assert other.category_depth is None
        assert other.file_directory == "tmp/"
        assert other.page_number == 2
        assert other.text_as_html is None
        assert other.url is None
        assert other.quotient == 1.4
        assert other.stem_length == 20
        with pytest.raises(AttributeError, match="etadata' object has no attribute 'coefficient'"):
            other.coefficient

    def but_it_raises_on_attempt_to_update_from_a_non_ElementMetadata_object(self):
        """`update()` raises `ValueError` when passed a plain dict."""
        meta = ElementMetadata()

        with pytest.raises(ValueError, match=r"ate\(\)' must be an instance of 'ElementMetadata'"):
            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportArgumentType]

    # -- It knows when it is equal to another instance -------------------------------------------

    def it_is_equal_to_another_instance_with_the_same_known_field_values(self):
        """Two instances built from identical known-field values compare equal."""
        meta = ElementMetadata(
            category_depth=1,
            coordinates=CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            ),
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-08",
            ),
            file_directory="tmp/",
            languages=["eng"],
            page_number=2,
            text_as_html="<table></table>",
            url="https://google.com",
        )

        assert meta == ElementMetadata(
            category_depth=1,
            coordinates=CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            ),
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-08",
            ),
            file_directory="tmp/",
            languages=["eng"],
            page_number=2,
            text_as_html="<table></table>",
            url="https://google.com",
        )

    def but_it_is_never_equal_to_a_non_ElementMetadata_object(self):
        """An object of another type is unequal even when its `__dict__` matches."""

        class NotElementMetadata:
            pass

        meta = ElementMetadata()
        other = NotElementMetadata()

        # -- all the "fields" are the same --
        assert meta.__dict__ == other.__dict__
        # -- but it is rejected solely because its type is different --
        assert meta != other

    def it_is_equal_to_another_instance_with_the_same_ad_hoc_field_values(self):
        """Instances with matching known and ad-hoc field values compare equal."""
        meta = ElementMetadata(category_depth=1)
        meta.coefficient = 0.58
        other = ElementMetadata(category_depth=1)
        other.coefficient = 0.58

        assert meta == other

    def but_it_is_not_equal_to_an_instance_with_ad_hoc_fields_that_differ(self):
        """A differing ad-hoc field value makes two instances unequal."""
        meta = ElementMetadata(category_depth=1)
        meta.coefficient = 0.58
        other = ElementMetadata(category_depth=1)
        other.coefficient = 0.72

        assert meta != other

    def it_is_not_equal_when_a_list_field_contains_different_items(self):
        """Differing `languages` lists make two instances unequal."""
        meta = ElementMetadata(languages=["eng"])

        assert meta != ElementMetadata(languages=["eng", "spa"])

    def and_it_is_not_equal_when_the_coordinates_sub_object_field_differs(self):
        """Differing coordinate points make two instances unequal."""
        meta = ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=RelativeCoordinateSystem(),
            )
        )

        assert meta != ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=((2, 2), (2, 4), (3, 4), (4, 2)),
                system=RelativeCoordinateSystem(),
            )
        )

    def and_it_is_not_equal_when_the_data_source_sub_object_field_differs(self):
        """Differing `data_source.date_created` values make two instances unequal."""
        meta = ElementMetadata(
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-08",
            )
        )

        assert meta != ElementMetadata(
            data_source=DataSourceMetadata(
                url="https://www.nih.gov/about-nih/who-we-are/nih-director",
                date_created="2023-11-09",
            )
        )

    # -- There is a consolidation-strategy for all known fields ----------------------------------

    def it_can_find_the_consolidation_strategy_for_each_of_its_known_fields(self):
        """Every name in `_known_field_names` has an entry in the consolidation strategies."""
        metadata = ElementMetadata()
        # -- sorted so that fields are checked in a stable order --
        metadata_field_names = sorted(metadata._known_field_names)
        consolidation_strategies = ConsolidationStrategy.field_consolidation_strategies()

        for field_name in metadata_field_names:
            assert field_name in consolidation_strategies, (
                f"ElementMetadata field `.{field_name}` does not have a consolidation strategy."
                " Add one in `ConsolidationStrategy.field_consolidation_strategies()`."
            )