mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

### Description As we add more and more steps to the pipeline (i.e. chunking, embedding, table manipulation), it would help seperate the responsibility of each of these into their own processes, running each in parallel using json files to share data across. This will also help guarantee data is serializable if this code was used in an actual pipeline. Following is a flow diagram of the proposed changes. As part of this change: * A parent pipeline class will be responsible for running each `node`, which can optionally be run via multiprocessing if it supports it, or not. Possible nodes at this moment: * Doc factory: creates all the ingest docs via the source connector * Source: reads/downloads all of the content to process to the local filesystem to the location set by the `download_dir` parameter. * Partition: runs partition on all of the downloaded content in json format. * Any number of reformat nodes that modify the partitioned content. This can include chunking, embedding, etc. * Write: push the final json into the destination via the destination connector * This pipeline relies on the information of the ingest docs to be available via their serialization. An optimization was introduced with the `IngestDocJsonMixin` which adds in all the `@property` fields to the serialized json already being created via the `DataClassJsonMixin` * For all intermediate steps (partitioning, reformatting), the content is saved to a dedicated location on the local filesystem. Right now it's set to `$HOME/.cache/unstructured/ingest/pipeline/STEP_NAME/`. * Minor changes: made sense to move some of the config parameters between the read and partition configs when I explicitly divided the responsibility to download vs partition the content in the pipeline. * The pipeline class only makes the doc factory, source and partition nodes required, keeping with the logic that has been supported so far. All reformatting nodes and write node are optional. * Long term, there should also be some changes to the base configs supported by the CLI to support pipeline specific configs, but for now what exists was used to minimize changes in this PR. * Final step to copy the final output to the location designated by the `_output_filename` value of the ingest doc. * Hashing occurs at each step by hashing the parameters of that step (i.e. partition configs) along with the previous step via the filename used. This allows each step to be the same _if_ all the parameters for it have not changed and the content so far is the same. * The only data that is shared and has writes to across processes is the dictionary of ingest json data. This dict is created using the `multiprocessing.manager.DictProxy` to make sure any interaction with it is behind a lock. ### Minor refactors included: * Utility methods added to extract configs from the click options * Utility method to add common options to click commands. * All writers moved to using the class approach which extracts a lot of the common code so there's less copy-paste when new runners are added. * Use `@property` for source metadata on base ingest doc to add logic to call `update_source_metadata` if it's still `None` at the time it's fetched. ### Additional bug fixes included * Fsspec connectors were not serializable due to the `ingest_doc_cls`. This was removed from the fields captured by the `@dataclass` decorator and added in a `__post_init__` method. * Various reddit connector params were missing. This doesn't have an explicit ingest test at the moment so was never caught. * Fsspec connector had the parent `update_source_metadata` misnamed as `update_source_metadata_metadata` so it was never being called. ### Flow Diagram 
258 lines
8.4 KiB
Python
258 lines
8.4 KiB
Python
import os
|
|
import pathlib
|
|
from dataclasses import dataclass
|
|
from typing import Any, Dict
|
|
|
|
import pytest
|
|
|
|
from unstructured.documents.elements import DataSourceMetadata
|
|
from unstructured.ingest.interfaces import (
|
|
BaseConnectorConfig,
|
|
BaseIngestDoc,
|
|
PartitionConfig,
|
|
ProcessorConfig,
|
|
ReadConfig,
|
|
)
|
|
from unstructured.partition.auto import partition
|
|
from unstructured.staging.base import convert_to_dict
|
|
|
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
|
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "../..", "example-docs")
|
|
TEST_DOWNLOAD_DIR = "/tmp"
|
|
TEST_OUTPUT_DIR = "/tmp"
|
|
TEST_ID = "test"
|
|
TEST_FILE_PATH = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
|
|
|
|
|
|
@dataclass
|
|
class TestConfig(BaseConnectorConfig):
|
|
id: str
|
|
path: str
|
|
|
|
|
|
TEST_CONFIG = TestConfig(id=TEST_ID, path=TEST_FILE_PATH)
|
|
TEST_SOURCE_URL = "test-source-url"
|
|
TEST_VERSION = "1.1.1"
|
|
TEST_RECORD_LOCATOR = {"id": "data-source-id"}
|
|
TEST_DATE_CREATED = "2021-01-01T00:00:00"
|
|
TEST_DATE_MODIFIED = "2021-01-02T00:00:00"
|
|
TEST_DATE_PROCESSSED = "2022-12-13T15:44:08"
|
|
|
|
|
|
@dataclass
|
|
class TestIngestDoc(BaseIngestDoc):
|
|
connector_config: TestConfig
|
|
|
|
@property
|
|
def filename(self):
|
|
return TEST_FILE_PATH
|
|
|
|
@property
|
|
def _output_filename(self):
|
|
return TEST_FILE_PATH + ".json"
|
|
|
|
@property
|
|
def source_url(self) -> str:
|
|
return TEST_SOURCE_URL
|
|
|
|
@property
|
|
def version(self) -> str:
|
|
return TEST_VERSION
|
|
|
|
@property
|
|
def record_locator(self) -> Dict[str, Any]:
|
|
return TEST_RECORD_LOCATOR
|
|
|
|
@property
|
|
def date_created(self) -> str:
|
|
return TEST_DATE_CREATED
|
|
|
|
@property
|
|
def date_modified(self) -> str:
|
|
return TEST_DATE_MODIFIED
|
|
|
|
@property
|
|
def exists(self) -> bool:
|
|
return True
|
|
|
|
def cleanup_file(self):
|
|
pass
|
|
|
|
def get_file(self):
|
|
pass
|
|
|
|
def has_output(self):
|
|
return True
|
|
|
|
def write_result(self, result):
|
|
pass
|
|
|
|
|
|
@pytest.fixture()
|
|
def partition_test_results():
|
|
# Reusable partition test results, calculated only once
|
|
result = partition(
|
|
filename=str(TEST_FILE_PATH),
|
|
data_source_metadata=DataSourceMetadata(
|
|
url=TEST_SOURCE_URL,
|
|
version=TEST_VERSION,
|
|
record_locator=TEST_RECORD_LOCATOR,
|
|
date_created=TEST_DATE_CREATED,
|
|
date_modified=TEST_DATE_MODIFIED,
|
|
date_processed=TEST_DATE_PROCESSSED,
|
|
),
|
|
)
|
|
return result
|
|
|
|
|
|
@pytest.fixture()
|
|
def partition_file_test_results(partition_test_results):
|
|
# Reusable partition_file test results, calculated only once
|
|
return convert_to_dict(partition_test_results)
|
|
|
|
|
|
def test_partition_file():
|
|
"""Validate partition_file returns a list of dictionaries with the expected keys,
|
|
metadatakeys, and data source metadata values."""
|
|
test_ingest_doc = TestIngestDoc(
|
|
connector_config=TEST_CONFIG,
|
|
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
|
|
)
|
|
test_ingest_doc._date_processed = TEST_DATE_PROCESSSED
|
|
isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig())
|
|
isd_elems = convert_to_dict(isd_elems_raw)
|
|
assert len(isd_elems)
|
|
expected_keys = {
|
|
"element_id",
|
|
"text",
|
|
"type",
|
|
"metadata",
|
|
}
|
|
# The document in TEST_FILE_PATH does not have elements with coordinates so
|
|
# partition is not expected to return coordinates metadata.
|
|
expected_metadata_keys = {
|
|
"data_source",
|
|
"filename",
|
|
"file_directory",
|
|
"filetype",
|
|
"languages",
|
|
"last_modified",
|
|
}
|
|
for elem in isd_elems:
|
|
# Parent IDs are non-deterministic - remove them from the test
|
|
elem["metadata"].pop("parent_id", None)
|
|
|
|
assert expected_keys == set(elem.keys())
|
|
assert expected_metadata_keys == set(elem["metadata"].keys())
|
|
data_source_metadata = elem["metadata"]["data_source"]
|
|
assert data_source_metadata["url"] == TEST_SOURCE_URL
|
|
assert data_source_metadata["version"] == TEST_VERSION
|
|
assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
|
|
assert data_source_metadata["date_created"] == TEST_DATE_CREATED
|
|
assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
|
|
assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSSED
|
|
|
|
|
|
def test_process_file_fields_include_default(mocker, partition_test_results):
|
|
"""Validate when metadata_include and metadata_exclude are not set, all fields:
|
|
("element_id", "text", "type", "metadata") are included"""
|
|
mock_partition = mocker.patch(
|
|
"unstructured.ingest.interfaces.partition",
|
|
return_value=partition_test_results,
|
|
)
|
|
test_ingest_doc = TestIngestDoc(
|
|
connector_config=TEST_CONFIG,
|
|
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
|
|
)
|
|
isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig())
|
|
isd_elems = convert_to_dict(isd_elems_raw)
|
|
assert len(isd_elems)
|
|
assert mock_partition.call_count == 1
|
|
for elem in isd_elems:
|
|
# Parent IDs are non-deterministic - remove them from the test
|
|
elem["metadata"].pop("parent_id", None)
|
|
|
|
assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
|
|
data_source_metadata = elem["metadata"]["data_source"]
|
|
assert data_source_metadata["url"] == TEST_SOURCE_URL
|
|
assert data_source_metadata["version"] == TEST_VERSION
|
|
assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
|
|
assert data_source_metadata["date_created"] == TEST_DATE_CREATED
|
|
assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
|
|
assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSSED
|
|
|
|
|
|
def test_process_file_metadata_includes_filename_and_filetype(
|
|
mocker,
|
|
partition_test_results,
|
|
):
|
|
"""Validate when metadata_include is set to "filename,filetype",
|
|
only filename is included in metadata"""
|
|
mocker.patch(
|
|
"unstructured.ingest.interfaces.partition",
|
|
return_value=partition_test_results,
|
|
)
|
|
partition_config = PartitionConfig(
|
|
metadata_include=["filename", "filetype"],
|
|
)
|
|
test_ingest_doc = TestIngestDoc(
|
|
connector_config=TEST_CONFIG,
|
|
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
|
|
)
|
|
isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
|
|
assert len(isd_elems)
|
|
for elem in isd_elems:
|
|
# Parent IDs are non-deterministic - remove them from the test
|
|
elem["metadata"].pop("parent_id", None)
|
|
|
|
assert set(elem["metadata"].keys()) == {"filename", "filetype"}
|
|
|
|
|
|
def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
|
|
"""Validate when metadata_exclude is set to "filename,page_number",
|
|
neither filename nor page_number are included in metadata"""
|
|
mocker.patch(
|
|
"unstructured.ingest.interfaces.partition",
|
|
return_value=partition_test_results,
|
|
)
|
|
partition_config = PartitionConfig(
|
|
metadata_exclude=["filename", "page_number"],
|
|
)
|
|
test_ingest_doc = TestIngestDoc(
|
|
connector_config=TEST_CONFIG,
|
|
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
processor_config=ProcessorConfig(
|
|
output_dir=TEST_OUTPUT_DIR,
|
|
),
|
|
)
|
|
isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
|
|
assert len(isd_elems)
|
|
for elem in isd_elems:
|
|
assert "filename" not in elem["metadata"]
|
|
assert "page_number" not in elem["metadata"]
|
|
|
|
|
|
def test_process_file_flatten_metadata(mocker, partition_test_results):
|
|
mocker.patch(
|
|
"unstructured.ingest.interfaces.partition",
|
|
return_value=partition_test_results,
|
|
)
|
|
partition_config = PartitionConfig(
|
|
metadata_include=["filename", "data_source"],
|
|
flatten_metadata=True,
|
|
)
|
|
test_ingest_doc = TestIngestDoc(
|
|
connector_config=TEST_CONFIG,
|
|
read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
|
|
processor_config=ProcessorConfig(
|
|
output_dir=TEST_OUTPUT_DIR,
|
|
),
|
|
)
|
|
isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
|
|
expected_keys = {"element_id", "text", "type", "filename", "data_source"}
|
|
for elem in isd_elems:
|
|
assert expected_keys == set(elem.keys())
|