mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

Each partitioner has a test like `test_partition_x_with_json()`. These tests serialize the elements produced by the partitioner to JSON, read them back in from JSON, and compare the elements before and after. Because our element equality (`Element.__eq__()`) is shallow, this comparison doesn't tell us much. If we take it one step further, as in `List[Element] -> JSON -> List[Element] -> JSON`, and then compare the two JSON strings, we gain some confidence that the serialized elements can be "re-hydrated" without losing any information. Doing so actually surfaced a few problems, all of them in the serialization/deserialization (serde) code that all elements share.
39 lines
1.3 KiB
Python
39 lines
1.3 KiB
Python
"""Utilities that ease unit-testing."""
|
|
|
|
import pathlib
|
|
from typing import List
|
|
|
|
from unstructured.documents.elements import Element
|
|
from unstructured.staging.base import elements_from_json, elements_to_json
|
|
|
|
|
|
def assert_round_trips_through_JSON(elements: List[Element]) -> None:
    """Assert that `elements` survive a JSON round-trip without changing their serialized form.

    The check compares JSON text rather than the elements themselves:

    1. Serialize `elements` to JSON (the "original" JSON).
    2. Deserialize that JSON back into a `List[Element]`.
    3. Serialize the deserialized elements to JSON again.
    4. Compare the two JSON values and fail when they are not equal.

    Raises:
        AssertionError: when either serialization step yields `None`, or when the
            round-tripped JSON is not equal to the original JSON.
    """
    # -- first pass: elements -> JSON --
    first_pass_json = elements_to_json(elements)
    assert first_pass_json is not None

    # -- second pass: JSON -> elements -> JSON --
    rehydrated_elements = elements_from_json(text=first_pass_json)
    second_pass_json = elements_to_json(rehydrated_elements)
    assert second_pass_json is not None

    # -- any difference between the two serializations means information was lost or altered --
    assert second_pass_json == first_pass_json, (
        f"JSON differs, expected\n{first_pass_json},\ngot\n{second_pass_json}\n"
    )
|
|
|
|
|
|
def example_doc_path(file_name: str) -> str:
    """Return the resolved absolute path, as a `str`, of `file_name` in "example-docs".

    The "example-docs" directory is located relative to this module: it is a sibling of the
    directory that contains this file. The path is computed only; nothing checks that the
    file actually exists.
    """
    # -- two levels up from this file is the directory that holds "example-docs" --
    base_dir = pathlib.Path(__file__).parent.parent
    resolved_path = base_dir.joinpath("example-docs", file_name).resolve()
    return str(resolved_path)
|