2023-10-23 11:51:52 -04:00
|
|
|
from unstructured.ingest.connector.local import LocalIngestDoc, SimpleLocalConfig
|
|
|
|
from unstructured.ingest.connector.registry import (
|
|
|
|
create_ingest_doc_from_dict,
|
|
|
|
create_ingest_doc_from_json,
|
|
|
|
)
|
|
|
|
from unstructured.ingest.interfaces import ProcessorConfig, ReadConfig
|
|
|
|
|
|
|
|
# Shared document for the round-trip tests below. It is constructed once, at
# import time, so every test compares against this same instance.
doc = LocalIngestDoc(
    path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf",
    connector_config=SimpleLocalConfig(input_path="test_unstructured_ingest/example-docs/"),
    processor_config=ProcessorConfig(),
    read_config=ReadConfig(),
)

# Source metadata is refreshed *before* serializing, so both serialized forms
# below are snapshots taken after this call.
doc.update_source_metadata()

# Serialized snapshots consumed by the deserialization tests.
serialized_json = doc.to_json()
serialized_dict = doc.to_dict()
|
2023-10-23 11:51:52 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_manual_deserialization():
    """Rebuild the shared doc via ``LocalIngestDoc.from_json`` and compare.

    The JSON snapshot taken at module import is fed straight back into the
    concrete class; the result must compare equal to the original ``doc``.
    """
    restored = LocalIngestDoc.from_json(serialized_json)
    assert doc == restored
|
|
|
|
|
|
|
|
|
|
|
|
def test_registry_from_json():
    """Rebuild the shared doc from JSON through the registry helper.

    Unlike the manual test, the concrete class is not named here:
    ``create_ingest_doc_from_json`` receives only the JSON snapshot, and the
    object it returns must compare equal to the original ``doc``.
    """
    restored = create_ingest_doc_from_json(serialized_json)
    assert doc == restored
|
|
|
|
|
|
|
|
|
|
|
|
def test_registry_from_dict():
    """Rebuild the shared doc from its dict form through the registry helper.

    ``create_ingest_doc_from_dict`` receives the ``to_dict()`` snapshot taken
    at module import; the object it returns must compare equal to ``doc``.
    """
    restored = create_ingest_doc_from_dict(serialized_dict)
    assert doc == restored
|
|
|
|
|
|
|
|
|
|
|
|
def test_source_metadata_serialization():
    """Check that ``_source_metadata`` is serialized only once it is populated.

    A fresh document (independent of the module-level ``doc``) is serialized
    twice with ``to_dict()``: once before and once after calling
    ``update_source_metadata()``. The ``"_source_metadata"`` entry must be
    falsy in the first snapshot and truthy in the second.
    """
    # Local instance on purpose: the module-level doc already had its source
    # metadata updated at import time.
    fresh_doc = LocalIngestDoc(
        path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf",
        connector_config=SimpleLocalConfig(input_path="test_unstructured_ingest/example-docs/"),
        processor_config=ProcessorConfig(),
        read_config=ReadConfig(),
    )

    snapshot_before = fresh_doc.to_dict()
    assert not snapshot_before["_source_metadata"]

    fresh_doc.update_source_metadata()

    snapshot_after = fresh_doc.to_dict()
    assert snapshot_after["_source_metadata"]
|