Roman Isecke a2af72bb79
local connector metadata and deserialization fix (#1800)
### Description
* The priority of this change was to fix deserialization of ingest docs.
Previously, the source metadata wasn't being persisted.
* To help debug this, source metadata was added to the local ingest doc
as well.
* Unit test added to make sure the metadata itself was persisted.
* As part of serialization, docs were being forced to fetch their source
metadata, if they hadn't already, so that it could be added to the generated
dict/json. This shouldn't have happened if the underlying variable
`_source_metadata` was `None`. This way the doc can be serialized without any
calls being made.
* Serialization was moved to the `to_dict` method to make it more
universal.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
2023-10-23 15:51:52 +00:00

49 lines
1.6 KiB
Python

import json
from unstructured.ingest.connector.local import LocalIngestDoc, SimpleLocalConfig
from unstructured.ingest.connector.registry import (
create_ingest_doc_from_dict,
create_ingest_doc_from_json,
)
from unstructured.ingest.interfaces import ProcessorConfig, ReadConfig
# Module-level state shared by the round-trip tests in this file: one local
# ingest doc pointing at the example PDF, and its JSON serialization.
doc = LocalIngestDoc(
    path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf",
    connector_config=SimpleLocalConfig(
        input_path="test_unstructured_ingest/example-docs/",
    ),
    processor_config=ProcessorConfig(),
    read_config=ReadConfig(),
)

# update_source_metadata() is called before serializing so that the JSON
# snapshot is taken from a doc whose metadata has already been refreshed.
doc.update_source_metadata()
serialized_json = doc.to_json()
def test_manual_deserialization():
    """Check that `LocalIngestDoc.from_json` rebuilds a doc equal to the shared `doc`."""
    assert doc == LocalIngestDoc.from_json(serialized_json)
def test_registry_from_json():
    """Check that `create_ingest_doc_from_json` rebuilds a doc equal to the shared `doc`."""
    rebuilt = create_ingest_doc_from_json(serialized_json)
    assert doc == rebuilt
def test_registry_from_dict():
    """Check that `create_ingest_doc_from_dict` rebuilds a doc equal to the shared `doc`.

    The dict handed to the registry is obtained by parsing the shared JSON
    string with `json.loads`.
    """
    payload: dict = json.loads(serialized_json)
    rebuilt = create_ingest_doc_from_dict(payload)
    assert doc == rebuilt
def test_source_metadata_serialization():
    """Check how `to_dict` reports `_source_metadata` before and after an update.

    A freshly constructed doc must serialize with a falsy `_source_metadata`
    entry; after `update_source_metadata()` the entry must be truthy.
    """
    # Build a fresh doc rather than reusing the module-level one, which has
    # already had update_source_metadata() called on it.
    fresh_doc = LocalIngestDoc(
        path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf",
        connector_config=SimpleLocalConfig(
            input_path="test_unstructured_ingest/example-docs/",
        ),
        processor_config=ProcessorConfig(),
        read_config=ReadConfig(),
    )

    dict_before_update = fresh_doc.to_dict()
    assert not dict_before_update["_source_metadata"]

    fresh_doc.update_source_metadata()
    dict_after_update = fresh_doc.to_dict()
    assert dict_after_update["_source_metadata"]