Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-07-08 09:33:43 +00:00

### Description

All other connectors that were not included in https://github.com/Unstructured-IO/unstructured/pull/2194 are now updated to follow the new pattern and mark any variables as sensitive where it makes sense.

Core changes:

* All connectors now support an `AccessConfig` to mark data that's needed for auth (i.e. username, password); the sensitive values are designated appropriately using the new enhanced field.
* All cli configs on the cli definition now inherit from the base config in the connector file to reuse the variables set on that dataclass.
* The base writer class was updated to better generalize the new approach, given better use of dataclasses.
* The base cli classes were refactored to also take into account the need for a connector and write config when creating the respective runner/writer classes.
* Any mismatches between the cli field name and the dataclass field name were resolved on the dataclass side, so users are unaffected but consistency is maintained.
* Custom redaction logic was added for mongodb URIs, since the password is expected to be part of the URI. In the logs, `"mongodb+srv://ingest-test-user:r4hK3BD07b@ingest-test.hgaig.mongodb.net/"` now becomes `"mongodb+srv://ingest-test-user:***REDACTED***@ingest-test.hgaig.mongodb.net/"` (a sketch of this pattern is shown below).
* All fsspec-based files are bundled into their own packages.
* The custom `_decode_dataclass` used for the enhanced json mixin was refactored to use a monkey-patch approach. The original approach broke on optional nested dataclasses during serialization, since the other methods in `dataclasses_json_core` weren't using the new method. By monkey-patching the original method, all other methods in that library pick up the new one (see the second sketch below).

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
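As a rough illustration of the mongodb URI redaction described above (a minimal sketch, not necessarily the PR's implementation; the helper name `redact_uri_password` and the use of `urllib.parse` are assumptions):

```python
from urllib.parse import urlparse

def redact_uri_password(uri: str) -> str:
    """Mask the password component of a connection URI before logging.

    Hypothetical helper for illustration; the PR's actual redaction
    logic may differ.
    """
    parsed = urlparse(uri)
    if parsed.password is None:
        return uri
    # Rebuild the netloc with the password swapped for a placeholder.
    netloc = f"{parsed.username}:***REDACTED***@{parsed.hostname}"
    if parsed.port:
        netloc = f"{netloc}:{parsed.port}"
    return parsed._replace(netloc=netloc).geturl()

print(redact_uri_password(
    "mongodb+srv://ingest-test-user:r4hK3BD07b@ingest-test.hgaig.mongodb.net/"
))
# mongodb+srv://ingest-test-user:***REDACTED***@ingest-test.hgaig.mongodb.net/
```

The monkey-patch described in the last bullet follows a standard pattern; here is a minimal sketch, assuming `dataclasses_json.core._decode_dataclass` is the function being replaced (the patched body below is a placeholder, not the actual enhanced decoder):

```python
import dataclasses_json.core

# Keep a reference to the library's original decoder.
_original_decode_dataclass = dataclasses_json.core._decode_dataclass

def _patched_decode_dataclass(cls, kvs, infer_missing):
    # Enhanced handling (e.g. for optional nested dataclasses) would go
    # here before delegating to the original behavior.
    return _original_decode_dataclass(cls, kvs, infer_missing)

# Swapping the attribute at module level means every internal caller in
# dataclasses_json resolves to the patched version, which is why this
# approach fixes the nested-dataclass serialization issue.
dataclasses_json.core._decode_dataclass = _patched_decode_dataclass
```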
348 lines
12 KiB
Python
import os
import pathlib
from dataclasses import dataclass
from typing import Any, Dict

import pytest

from unstructured.documents.elements import DataSourceMetadata
from unstructured.ingest.connector.fsspec.sftp import SftpAccessConfig, SimpleSftpConfig
from unstructured.ingest.interfaces import (
    BaseConnectorConfig,
    BaseSingleIngestDoc,
    FsspecConfig,
    PartitionConfig,
    ProcessorConfig,
    ReadConfig,
)
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dict

DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "../..", "example-docs")
TEST_DOWNLOAD_DIR = "/tmp"
TEST_OUTPUT_DIR = "/tmp"
TEST_ID = "test"
TEST_FILE_PATH = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")


@dataclass
class ExampleConfig(BaseConnectorConfig):
    id: str
    path: str


TEST_CONFIG = ExampleConfig(id=TEST_ID, path=TEST_FILE_PATH)
TEST_SOURCE_URL = "test-source-url"
TEST_VERSION = "1.1.1"
TEST_RECORD_LOCATOR = {"id": "data-source-id"}
TEST_DATE_CREATED = "2021-01-01T00:00:00"
TEST_DATE_MODIFIED = "2021-01-02T00:00:00"
TEST_DATE_PROCESSED = "2022-12-13T15:44:08"


@dataclass
class ExampleIngestDoc(BaseSingleIngestDoc):
    connector_config: ExampleConfig

    @property
    def filename(self):
        return TEST_FILE_PATH

    @property
    def _output_filename(self):
        return TEST_FILE_PATH + ".json"

    @property
    def source_url(self) -> str:
        return TEST_SOURCE_URL

    @property
    def version(self) -> str:
        return TEST_VERSION

    @property
    def record_locator(self) -> Dict[str, Any]:
        return TEST_RECORD_LOCATOR

    @property
    def date_created(self) -> str:
        return TEST_DATE_CREATED

    @property
    def date_modified(self) -> str:
        return TEST_DATE_MODIFIED

    @property
    def exists(self) -> bool:
        return True

    def cleanup_file(self):
        pass

    def get_file(self):
        pass

    def has_output(self):
        return True

    def write_result(self, result):
        pass


@pytest.fixture()
def partition_test_results():
    # Reusable partition test results, calculated only once
    result = partition(
        filename=str(TEST_FILE_PATH),
        data_source_metadata=DataSourceMetadata(
            url=TEST_SOURCE_URL,
            version=TEST_VERSION,
            record_locator=TEST_RECORD_LOCATOR,
            date_created=TEST_DATE_CREATED,
            date_modified=TEST_DATE_MODIFIED,
            date_processed=TEST_DATE_PROCESSED,
        ),
    )
    return result


@pytest.fixture()
def partition_file_test_results(partition_test_results):
    # Reusable partition_file test results, calculated only once
    return convert_to_dict(partition_test_results)


def test_partition_file():
    """Validate that partition_file returns a list of dictionaries with the expected
    keys, metadata keys, and data source metadata values."""
    test_ingest_doc = ExampleIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
    )
    test_ingest_doc._date_processed = TEST_DATE_PROCESSED
    isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig())
    isd_elems = convert_to_dict(isd_elems_raw)
    assert len(isd_elems)
    expected_keys = {
        "element_id",
        "text",
        "type",
        "metadata",
    }
    # The document in TEST_FILE_PATH does not have elements with coordinates, so
    # partition is not expected to return coordinates metadata.
    expected_metadata_keys = {
        "data_source",
        "filename",
        "file_directory",
        "filetype",
        "languages",
        "last_modified",
    }
    for elem in isd_elems:
        # Parent IDs are non-deterministic - remove them from the test
        elem["metadata"].pop("parent_id", None)

        assert expected_keys == set(elem.keys())
        assert expected_metadata_keys == set(elem["metadata"].keys())
        data_source_metadata = elem["metadata"]["data_source"]
        assert data_source_metadata["url"] == TEST_SOURCE_URL
        assert data_source_metadata["version"] == TEST_VERSION
        assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
        assert data_source_metadata["date_created"] == TEST_DATE_CREATED
        assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
        assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED


def test_process_file_fields_include_default(mocker, partition_test_results):
    """Validate that when metadata_include and metadata_exclude are not set, all
    fields ("element_id", "text", "type", "metadata") are included"""
    mock_partition = mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    test_ingest_doc = ExampleIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
    )
    isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig())
    isd_elems = convert_to_dict(isd_elems_raw)
    assert len(isd_elems)
    assert mock_partition.call_count == 1
    for elem in isd_elems:
        # Parent IDs are non-deterministic - remove them from the test
        elem["metadata"].pop("parent_id", None)

        assert {"element_id", "text", "type", "metadata"} == set(elem.keys())
        data_source_metadata = elem["metadata"]["data_source"]
        assert data_source_metadata["url"] == TEST_SOURCE_URL
        assert data_source_metadata["version"] == TEST_VERSION
        assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
        assert data_source_metadata["date_created"] == TEST_DATE_CREATED
        assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
        assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSED


def test_process_file_metadata_includes_filename_and_filetype(
    mocker,
    partition_test_results,
):
    """Validate that when metadata_include is set to "filename,filetype",
    only filename and filetype are included in metadata"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    partition_config = PartitionConfig(
        metadata_include=["filename", "filetype"],
    )
    test_ingest_doc = ExampleIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
    )
    isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
    assert len(isd_elems)
    for elem in isd_elems:
        # Parent IDs are non-deterministic - remove them from the test
        elem["metadata"].pop("parent_id", None)

        assert set(elem["metadata"].keys()) == {"filename", "filetype"}


def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
    """Validate that when metadata_exclude is set to "filename,page_number",
    neither filename nor page_number is included in metadata"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    partition_config = PartitionConfig(
        metadata_exclude=["filename", "page_number"],
    )
    test_ingest_doc = ExampleIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        processor_config=ProcessorConfig(
            output_dir=TEST_OUTPUT_DIR,
        ),
    )
    isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
    assert len(isd_elems)
    for elem in isd_elems:
        assert "filename" not in elem["metadata"]
        assert "page_number" not in elem["metadata"]


def test_process_file_flatten_metadata(mocker, partition_test_results):
    """Validate that when flatten_metadata is set, the included metadata fields
    are promoted to the top level of each element"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    partition_config = PartitionConfig(
        metadata_include=["filename", "file_directory", "filetype"],
        flatten_metadata=True,
    )
    test_ingest_doc = ExampleIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        processor_config=ProcessorConfig(
            output_dir=TEST_OUTPUT_DIR,
        ),
    )
    isd_elems = test_ingest_doc.process_file(partition_config=partition_config)
    expected_keys = {"element_id", "text", "type", "filename", "file_directory", "filetype"}
    for elem in isd_elems:
        assert expected_keys == set(elem.keys())


def test_post_init_invalid_protocol():
    """Validate that an invalid protocol raises a ValueError"""
    with pytest.raises(ValueError):
        FsspecConfig(remote_url="ftp://example.com/path/to/file.txt")


def test_fsspec_path_extraction_dropbox_root():
    """Validate that the path extraction works for the dropbox root"""
    config = FsspecConfig(remote_url="dropbox:// /")
    assert config.protocol == "dropbox"
    assert config.path_without_protocol == " /"
    assert config.dir_path == " "
    assert config.file_path == ""


def test_fsspec_path_extraction_dropbox_subfolder():
    """Validate that the path extraction works for a dropbox subfolder"""
    config = FsspecConfig(remote_url="dropbox://path")
    assert config.protocol == "dropbox"
    assert config.path_without_protocol == "path"
    assert config.dir_path == "path"
    assert config.file_path == ""


def test_fsspec_path_extraction_s3_bucket_only():
    """Validate that the path extraction works for an s3 bucket without a filename"""
    config = FsspecConfig(remote_url="s3://bucket-name")
    assert config.protocol == "s3"
    assert config.path_without_protocol == "bucket-name"
    assert config.dir_path == "bucket-name"
    assert config.file_path == ""


def test_fsspec_path_extraction_s3_valid_path():
    """Validate that the path extraction works for an s3 bucket with a filename"""
    config = FsspecConfig(remote_url="s3://bucket-name/path/to/file.txt")
    assert config.protocol == "s3"
    assert config.path_without_protocol == "bucket-name/path/to/file.txt"
    assert config.dir_path == "bucket-name"
    assert config.file_path == "path/to/file.txt"


def test_fsspec_path_extraction_s3_invalid_path():
    """Validate that an invalid s3 path (one that mimics the triple slash
    used for dropbox) raises a ValueError"""
    with pytest.raises(ValueError):
        FsspecConfig(remote_url="s3:///bucket-name/path/to")


def test_sftp_path_extraction_post_init_with_extension():
    """Validate that the path extraction works for sftp with a file extension"""
    config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/file.txt",
        access_config=SftpAccessConfig(username="username", password="password", host="", port=22),
    )
    assert config.file_path == "file.txt"
    assert config.dir_path == "path/to"
    assert config.path_without_protocol == "path/to"
    assert config.access_config.host == "example.com"
    assert config.access_config.port == 22


def test_sftp_path_extraction_without_extension():
    """Validate that the path extraction works for sftp without a file extension"""
    config = SimpleSftpConfig(
        remote_url="sftp://example.com/path/to/directory",
        access_config=SftpAccessConfig(username="username", password="password", host="", port=22),
    )
    assert config.file_path == ""
    assert config.dir_path == "path/to/directory"
    assert config.path_without_protocol == "path/to/directory"
    assert config.access_config.host == "example.com"
    assert config.access_config.port == 22


def test_sftp_path_extraction_with_port():
    """Validate that the path extraction works for sftp with a non-default port"""
    config = SimpleSftpConfig(
        remote_url="sftp://example.com:47474/path/to/file.txt",
        access_config=SftpAccessConfig(username="username", password="password", host="", port=22),
    )
    assert config.file_path == "file.txt"
    assert config.dir_path == "path/to"
    assert config.path_without_protocol == "path/to"
    assert config.access_config.host == "example.com"
    assert config.access_config.port == 47474