# pyright: reportPrivateUsage=false

"""Test-suite for `unstructured.documents.elements` module."""

from __future__ import annotations

import json
import pathlib
from functools import partial

import pytest

from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.coordinates import (
    CoordinateSystem,
    Orientation,
    RelativeCoordinateSystem,
)
from unstructured.documents.elements import (
    UUID,
    ConsolidationStrategy,
    CoordinatesMetadata,
    DataSourceMetadata,
    Element,
    ElementMetadata,
    NoID,
    Points,
    RegexMetadata,
    Text,
)


def test_text_id():
    """A `Text` built without an explicit `element_id` reports the expected id for its text."""
    text_element = Text(text="hello there!")
    assert text_element.id == "c69509590d81db2f37f9d75480c8efed"


def test_text_uuid():
    """A `Text` built with `element_id=UUID()` exposes a 36-character, 4-dash string id."""
    text_element = Text(text="hello there!", element_id=UUID())

    # -- named `element_id` rather than `id` so the builtin is not shadowed --
    element_id = text_element.id

    assert isinstance(element_id, str)
    assert len(element_id) == 36
    assert element_id.count("-") == 4
    # -- Test that the element is JSON serializable. This should run without an error --
    json.dumps(text_element.to_dict())


def test_element_defaults_to_blank_id():
    """An `Element` built with no arguments has a `NoID` instance as its id."""
    element = Element()
    assert isinstance(element.id, NoID)


def test_element_uuid():
    """An `Element` built with `element_id=UUID()` keeps a `UUID` instance as its id."""
    element = Element(element_id=UUID())
    assert isinstance(element.id, UUID)


def test_text_element_apply_cleaners():
    """`Text.apply()` with a prefix cleaner strips the bracketed citation number."""
    text_element = Text(text="[1] A Textbook on Crocodile Habitats")

    text_element.apply(partial(clean_prefix, pattern=r"\[\d{1,2}\]"))

    assert str(text_element) == "A Textbook on Crocodile Habitats"


def test_text_element_apply_multiple_cleaners():
    """`Text.apply()` accepts several cleaners; here prefix removal plus translation to "ru"."""
    cleaners = [
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    ]
    text_element = Text(text="[1] A Textbook on Crocodile Habitats")

    text_element.apply(*cleaners)

    assert str(text_element) == "Учебник по крокодильным средам обитания"


def test_apply_raises_if_func_does_not_produce_string():
    """`Text.apply()` raises `ValueError` when a cleaner returns something other than `str`."""

    def bad_cleaner(s: str):
        return 1

    text_element = Text(text="[1] A Textbook on Crocodile Habitats")

    with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
        text_element.apply(bad_cleaner)  # pyright: ignore[reportGeneralTypeIssues]


@pytest.mark.parametrize(
    ("coordinates", "orientation1", "orientation2", "expected_coords"),
    [
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.CARTESIAN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.CARTESIAN,
            Orientation.SCREEN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.CARTESIAN,
            ((10, 1980), (10, 1960), (30, 1960), (30, 1980)),
        ),
        (
            ((1, 2), (1, 4), (3, 4), (3, 2)),
            Orientation.SCREEN,
            Orientation.SCREEN,
            ((10, 20), (10, 40), (30, 40), (30, 20)),
        ),
    ],
)
def test_convert_coordinates_to_new_system(
    coordinates: Points,
    orientation1: Orientation,
    orientation2: Orientation,
    expected_coords: Points,
):
    """Points convert from a 100x200 system to a 1000x2000 system per the parametrized cases.

    Both the returned points and, after an `in_place=True` call, the points stored on the
    element's metadata are compared against `expected_coords`.
    """
    coord1 = CoordinateSystem(100, 200)
    coord1.orientation = orientation1
    coord2 = CoordinateSystem(1000, 2000)
    coord2.orientation = orientation2
    element = Element(coordinates=coordinates, coordinate_system=coord1)

    new_coords = element.convert_coordinates_to_new_system(coord2)

    assert new_coords is not None
    # -- `zip()` stops at the shorter input, so check lengths or a short result would pass --
    assert len(new_coords) == len(expected_coords)
    for new_coord, expected in zip(new_coords, expected_coords):
        assert new_coord == pytest.approx(expected)  # pyright: ignore[reportUnknownMemberType]

    element.convert_coordinates_to_new_system(coord2, in_place=True)

    assert element.metadata.coordinates is not None
    assert element.metadata.coordinates.points is not None
    assert len(element.metadata.coordinates.points) == len(expected_coords)
    for new_coord, expected in zip(element.metadata.coordinates.points, expected_coords):
        assert new_coord == pytest.approx(expected)  # pyright: ignore[reportUnknownMemberType]
    assert element.metadata.coordinates.system == coord2


def test_convert_coordinate_to_new_system_none():
    """Conversion returns `None` when the element has neither points nor a coordinate system."""
    element = Element(coordinates=None, coordinate_system=None)
    coord = CoordinateSystem(100, 200)
    coord.orientation = Orientation.SCREEN

    assert element.convert_coordinates_to_new_system(coord) is None


def test_element_constructor_coordinates_all_present():
    """Points and system passed to `Element()` end up in `metadata.coordinates`."""
    coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
    coordinate_system = RelativeCoordinateSystem()

    element = Element(coordinates=coordinates, coordinate_system=coordinate_system)

    expected_coordinates_metadata = CoordinatesMetadata(
        points=coordinates,
        system=coordinate_system,
    )
    assert element.metadata.coordinates == expected_coordinates_metadata


def test_element_constructor_coordinates_points_absent():
    """`Element()` raises `ValueError` when given a coordinate system but no points."""
    with pytest.raises(ValueError) as exc_info:
        Element(coordinate_system=RelativeCoordinateSystem())

    assert (
        str(exc_info.value)
        == "Coordinates points should not exist without coordinates system and vice versa."
    )


def test_element_constructor_coordinates_system_absent():
    """`Element()` raises `ValueError` when given points but no coordinate system."""
    with pytest.raises(ValueError) as exc_info:
        Element(coordinates=((1, 2), (1, 4), (3, 4), (3, 2)))

    assert (
        str(exc_info.value)
        == "Coordinates points should not exist without coordinates system and vice versa."
    )


def test_coordinate_metadata_serdes():
    """`CoordinatesMetadata` serializes to the expected dict and round-trips via `from_dict()`."""
    coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
    coordinate_system = RelativeCoordinateSystem()
    coordinates_metadata = CoordinatesMetadata(points=coordinates, system=coordinate_system)
    expected_schema = {
        "layout_height": 1,
        "layout_width": 1,
        "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
        "system": "RelativeCoordinateSystem",
    }

    coordinates_metadata_dict = coordinates_metadata.to_dict()

    assert coordinates_metadata_dict == expected_schema
    assert CoordinatesMetadata.from_dict(coordinates_metadata_dict) == coordinates_metadata


def test_element_to_dict():
    """`Element.to_dict()` emits metadata, type, text and element_id in the expected shape."""
    coordinates = ((1, 2), (1, 4), (3, 4), (3, 2))
    coordinate_system = RelativeCoordinateSystem()
    element = Element(
        element_id="awt32t1",
        coordinates=coordinates,
        coordinate_system=coordinate_system,
    )

    assert element.to_dict() == {
        "metadata": {
            "coordinates": {
                "layout_height": 1,
                "layout_width": 1,
                "points": ((1, 2), (1, 4), (3, 4), (3, 2)),
                "system": "RelativeCoordinateSystem",
            },
        },
        "type": None,
        "text": "",
        "element_id": "awt32t1",
    }


def test_regex_metadata_round_trips_through_JSON():
    """metadata.regex_metadata should appear at full depth in JSON."""
    regex_metadata = {
        "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
        "version": [
            RegexMetadata(text="current=v1.7.2", start=7, end=21),
            RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
        ],
    }
    metadata = ElementMetadata(regex_metadata=regex_metadata)

    metadata_json = json.dumps(metadata.to_dict())
    deserialized_metadata = ElementMetadata.from_dict(json.loads(metadata_json))
    reserialized_metadata_json = json.dumps(deserialized_metadata.to_dict())

    assert reserialized_metadata_json == metadata_json


class DescribeElementMetadata:
    """Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""

    # -- It can be constructed with known keyword arguments. In particular, including a non-known
    # -- keyword argument produces a type-error at development time and raises an exception at
    # -- runtime. This catches typos before they reach production.

    def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
        with pytest.raises(TypeError, match="got an unexpected keyword argument 'file_name'"):
            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportGeneralTypeIssues]

    @pytest.mark.parametrize(
        "file_path",
        [
            pathlib.Path("documents/docx") / "memos" / "memo-2023-11-10.docx",
            "documents/docx/memos/memo-2023-11-10.docx",
        ],
    )
    def it_accommodates_either_a_pathlib_Path_or_str_for_its_filename_arg(
        self, file_path: pathlib.Path | str
    ):
        meta = ElementMetadata(filename=file_path)

        assert meta.file_directory == "documents/docx/memos"
        assert meta.filename == "memo-2023-11-10.docx"

    def it_leaves_both_filename_and_file_directory_None_when_neither_is_specified(self):
        meta = ElementMetadata()

        assert meta.file_directory is None
        assert meta.filename is None

    @pytest.mark.parametrize("file_path", [pathlib.Path("memo.docx"), "memo.docx"])
    def and_it_leaves_file_directory_None_when_not_specified_and_filename_is_not_a_path(
        self, file_path: pathlib.Path | str
    ):
        meta = ElementMetadata(filename=file_path)

        assert meta.file_directory is None
        assert meta.filename == "memo.docx"

    def and_it_splits_off_directory_path_from_its_filename_arg_when_it_is_a_file_path(self):
        meta = ElementMetadata(filename="documents/docx/memo-2023-11-11.docx")

        assert meta.file_directory == "documents/docx"
        assert meta.filename == "memo-2023-11-11.docx"

    def but_it_prefers_a_specified_file_directory_when_filename_also_contains_a_path(self):
        meta = ElementMetadata(filename="tmp/staging/memo.docx", file_directory="documents/docx")

        assert meta.file_directory == "documents/docx"
        assert meta.filename == "memo.docx"

    # -- It knows the types of its known members so type-checking support is available. --

    def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
        ElementMetadata(
            category_depth="2",  # pyright: ignore[reportGeneralTypeIssues]
            file_directory=True,  # pyright: ignore[reportGeneralTypeIssues]
            text_as_html=42,  # pyright: ignore[reportGeneralTypeIssues]
        )
        # -- it does not check types at runtime however (choosing to avoid validation overhead) --

    # -- It only stores a field's value when it is not None.
-- def it_returns_the_value_of_an_attribute_it_has(self): meta = ElementMetadata(url="https://google.com") assert "url" in meta.__dict__ assert meta.url == "https://google.com" def and_it_returns_None_for_a_known_attribute_it_does_not_have(self): meta = ElementMetadata() assert "url" not in meta.__dict__ assert meta.url is None def but_it_raises_AttributeError_for_an_unknown_attribute_it_does_not_have(self): meta = ElementMetadata() assert "coefficient" not in meta.__dict__ with pytest.raises(AttributeError, match="object has no attribute 'coefficient'"): meta.coefficient def it_stores_a_non_None_field_value_when_assigned(self): meta = ElementMetadata() assert "file_directory" not in meta.__dict__ meta.file_directory = "tmp/" assert "file_directory" in meta.__dict__ assert meta.file_directory == "tmp/" def it_removes_a_field_when_None_is_assigned_to_it(self): meta = ElementMetadata(file_directory="tmp/") assert "file_directory" in meta.__dict__ assert meta.file_directory == "tmp/" meta.file_directory = None assert "file_directory" not in meta.__dict__ assert meta.file_directory is None # -- It can serialize itself to a dict ------------------------------------------------------- def it_can_serialize_itself_to_a_dict(self): meta = ElementMetadata( category_depth=1, file_directory="tmp/", page_number=2, text_as_html="