John 3843af666e
feat: Enable remote chunking via unstructured-ingest (#2905)
Update: The CLI shell script works when sending documents to the free
API, but the paid API is down, so testing against it is still pending.

- The first commit adds docstrings and fixes type hints.
- The second commit reorganizes `test_unstructured_ingest` so it matches
the structure of `unstructured/ingest`.
- The third commit contains the primary changes for this PR.
- The `.chunk()` method responsible for dispatching elements to the correct
chunking method is moved from `ChunkingConfig` to `Chunker`, so that
`ChunkingConfig` acts as a plain config object instead of containing
implementation logic. `Chunker.chunk()` also now takes a JSON file
instead of a list of elements. This avoids redundant serialization
when the file is to be sent to the API for chunking.

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
2024-04-25 00:24:58 +00:00

47 lines
1.5 KiB
Python

from unstructured.ingest.connector.local import LocalIngestDoc, SimpleLocalConfig
from unstructured.ingest.connector.registry import (
create_ingest_doc_from_dict,
create_ingest_doc_from_json,
)
from unstructured.ingest.interfaces import ProcessorConfig, ReadConfig
# Module-level fixture shared by the round-trip tests in this file: one local
# ingest doc plus its JSON and dict serializations, all built at import time.
_connector_config = SimpleLocalConfig(input_path="test_unstructured_ingest/example-docs/")
doc = LocalIngestDoc(
    path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf",
    connector_config=_connector_config,
    processor_config=ProcessorConfig(),
    read_config=ReadConfig(),
)
# Both serialized snapshots are taken after the source-metadata update.
doc.update_source_metadata()
serialized_json = doc.to_json()
serialized_dict = doc.to_dict()
def test_manual_deserialization():
    """Round-trip the shared doc through `LocalIngestDoc.from_json`.

    The doc rebuilt from the module-level JSON snapshot must compare equal to
    the module-level `doc` it was serialized from.
    """
    restored = LocalIngestDoc.from_json(serialized_json)
    assert doc == restored
def test_registry_from_json():
    """Rebuild the shared doc from JSON via `create_ingest_doc_from_json`.

    Unlike the manual test, no concrete doc class is named here; the registry
    helper alone must return a doc equal to the module-level `doc`.
    """
    rebuilt = create_ingest_doc_from_json(serialized_json)
    assert doc == rebuilt
def test_registry_from_dict():
    """Rebuild the shared doc from its dict form via `create_ingest_doc_from_dict`.

    The registry helper must return a doc equal to the module-level `doc` that
    produced the dict snapshot.
    """
    rebuilt = create_ingest_doc_from_dict(serialized_dict)
    assert doc == rebuilt
def test_source_metadata_serialization():
    """Check `_source_metadata` in `to_dict()` before and after the metadata update.

    A fresh doc is built locally instead of reusing the module-level one,
    because the "before" snapshot has to be taken prior to calling
    `update_source_metadata()`.
    """
    fresh_doc = LocalIngestDoc(
        path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf",
        connector_config=SimpleLocalConfig(input_path="test_unstructured_ingest/example-docs/"),
        processor_config=ProcessorConfig(),
        read_config=ReadConfig(),
    )

    # Before the update, the key is present but its value is falsy.
    before_update = fresh_doc.to_dict()
    assert not before_update["_source_metadata"]

    # After the update, the same key must hold a truthy value.
    fresh_doc.update_source_metadata()
    after_update = fresh_doc.to_dict()
    assert after_update["_source_metadata"]