241 lines
7.7 KiB
Python
Raw Normal View History

2023-06-06 09:03:13 -07:00
import os
import pathlib
from dataclasses import dataclass
from typing import Any, Dict
2023-06-06 09:03:13 -07:00
import pytest
from freezegun import freeze_time
2023-06-06 09:03:13 -07:00
from unstructured.documents.elements import DataSourceMetadata
2023-06-06 09:03:13 -07:00
from unstructured.ingest.interfaces import (
BaseConnectorConfig,
BaseIngestDoc,
StandardConnectorConfig,
)
from unstructured.partition.auto import partition
from unstructured.staging.base import convert_to_dict
# Resolved absolute path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# "example-docs" folder located two directory levels above this module.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "../..", "example-docs")
# Passed as download_dir / output_dir of StandardConnectorConfig in the tests.
TEST_DOWNLOAD_DIR = "/tmp"
TEST_OUTPUT_DIR = "/tmp"
# Identifier given to the TestConfig instance used by the tests.
TEST_ID = "test"
# Example document that the tests partition and that TestIngestDoc reports as its filename.
TEST_FILE_PATH = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
2023-06-06 09:03:13 -07:00
@dataclass
class TestConfig(BaseConnectorConfig):
    """Minimal connector config used to construct ingest docs in this test module."""

    # The "Test" prefix matches pytest's default class-collection pattern. Opt out
    # explicitly so pytest does not try to collect this dataclass as a test class
    # (it cannot: dataclasses define __init__, which produces a collection warning).
    # The attribute is unannotated, so it is not treated as a dataclass field.
    __test__ = False

    id: str  # identifier of the test source
    path: str  # path of the test source document
# Config instance passed as `config` to the TestIngestDoc objects built by the tests.
TEST_CONFIG = TestConfig(id=TEST_ID, path=TEST_FILE_PATH)
# Data source metadata values: returned by TestIngestDoc's properties, fed to
# DataSourceMetadata in the partition fixture, and asserted on in the tests.
TEST_SOURCE_URL = "test-source-url"
TEST_VERSION = "1.1.1"
TEST_RECORD_LOCATOR = {"id": "data-source-id"}
TEST_DATE_CREATED = "2021-01-01T00:00:00"
TEST_DATE_MODIFIED = "2021-01-02T00:00:00"
# NOTE: the name is spelled with three S's; the rest of this module references
# that exact spelling, so it must be renamed everywhere at once or not at all.
TEST_DATE_PROCESSSED = "2022-12-13T15:44:08"
2023-06-06 09:03:13 -07:00
2023-06-06 09:03:13 -07:00
@dataclass
class TestIngestDoc(BaseIngestDoc):
    """Stub ingest doc backed by the module-level test constants.

    Every metadata property returns the matching ``TEST_*`` constant, and the
    file-handling hooks (``get_file``, ``cleanup_file``, ``write_result``) are
    no-ops, so instances can be processed without fetching or writing anything
    through these hooks.
    """

    # The "Test" prefix matches pytest's default class-collection pattern. Opt out
    # explicitly so pytest does not try to collect this dataclass as a test class
    # (it cannot: dataclasses define __init__, which produces a collection warning).
    # The attribute is unannotated, so it is not treated as a dataclass field.
    __test__ = False

    config: TestConfig

    @property
    def filename(self) -> str:
        """Path of the local example document."""
        return TEST_FILE_PATH

    @property
    def _output_filename(self) -> str:
        """Path of the example document with a ``.json`` suffix appended."""
        return TEST_FILE_PATH + ".json"

    @property
    def source_url(self) -> str:
        return TEST_SOURCE_URL

    @property
    def version(self) -> str:
        return TEST_VERSION

    @property
    def record_locator(self) -> Dict[str, Any]:
        return TEST_RECORD_LOCATOR

    @property
    def date_created(self) -> str:
        return TEST_DATE_CREATED

    @property
    def date_modified(self) -> str:
        return TEST_DATE_MODIFIED

    @property
    def exists(self) -> bool:
        """Always report the source document as existing."""
        return True

    def cleanup_file(self) -> None:
        """No-op: this stub never removes anything."""
        pass

    def get_file(self) -> None:
        """No-op: the example document is already on disk at ``filename``."""
        pass

    def has_output(self) -> bool:
        """Always report that output already exists."""
        return True

    def write_result(self, result) -> None:
        """No-op: results are inspected in memory by the tests, not written."""
        pass
2023-06-06 09:03:13 -07:00
@pytest.fixture()
def partition_test_results():
    """Partition the example document with fully populated data source metadata.

    The fixture uses pytest's default (function) scope, so the document is
    partitioned again for each test that requests it; the result is reusable
    across tests but not cached between them.

    Returns:
        The value returned by ``partition`` for ``TEST_FILE_PATH``, with a
        ``DataSourceMetadata`` built from the module-level ``TEST_*`` constants.
    """
    return partition(
        filename=str(TEST_FILE_PATH),
        data_source_metadata=DataSourceMetadata(
            url=TEST_SOURCE_URL,
            version=TEST_VERSION,
            record_locator=TEST_RECORD_LOCATOR,
            date_created=TEST_DATE_CREATED,
            date_modified=TEST_DATE_MODIFIED,
            date_processed=TEST_DATE_PROCESSSED,
        ),
    )
2023-06-06 09:03:13 -07:00
2023-06-06 09:03:13 -07:00
@pytest.fixture()
def partition_file_test_results(partition_test_results):
    """Return the ``partition_test_results`` fixture value passed through ``convert_to_dict``.

    Like the fixture it depends on, this uses pytest's default (function) scope,
    so the conversion runs again for each test that requests it.
    """
    return convert_to_dict(partition_test_results)
def test_partition_file():
    """Validate partition_file returns a list of dictionaries with the expected keys,
    metadata keys, and data source metadata values."""
    test_ingest_doc = TestIngestDoc(
        config=TEST_CONFIG,
        standard_config=StandardConnectorConfig(
            download_dir=TEST_DOWNLOAD_DIR,
            output_dir=TEST_OUTPUT_DIR,
        ),
    )
    # Set the processed date explicitly; the loop below asserts that the same
    # value is reported back as data_source["date_processed"].
    test_ingest_doc._date_processed = TEST_DATE_PROCESSSED
    isd_elems = test_ingest_doc.partition_file()
    # Guard against an empty result, which would let the loop below pass vacuously.
    assert isd_elems
    expected_keys = {
        "element_id",
        "text",
        "type",
        "metadata",
    }
    # The document in TEST_FILE_PATH does not have elements with coordinates so
    # partition is not expected to return coordinates metadata.
    expected_metadata_keys = {"data_source", "filename", "file_directory", "filetype", "date"}
    for elem in isd_elems:
        assert expected_keys == set(elem.keys())
        assert expected_metadata_keys == set(elem["metadata"].keys())
        data_source_metadata = elem["metadata"]["data_source"]
        assert data_source_metadata["url"] == TEST_SOURCE_URL
        assert data_source_metadata["version"] == TEST_VERSION
        assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR
        assert data_source_metadata["date_created"] == TEST_DATE_CREATED
        assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED
        assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSSED
@freeze_time(TEST_DATE_PROCESSSED)
def test_process_file_fields_include_default(mocker, partition_test_results):
    """With neither metadata_include nor metadata_exclude configured, every element
    keeps all of its top-level fields: "element_id", "text", "type" and "metadata"."""
    partition_mock = mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    standard_config = StandardConnectorConfig(
        download_dir=TEST_DOWNLOAD_DIR,
        output_dir=TEST_OUTPUT_DIR,
    )
    ingest_doc = TestIngestDoc(config=TEST_CONFIG, standard_config=standard_config)

    elements = ingest_doc.process_file()

    assert len(elements) > 0
    assert partition_mock.call_count == 1
    # Time is frozen at TEST_DATE_PROCESSSED, so that is the processed date the
    # patched partition call is expected to have received.
    passed_metadata = partition_mock.call_args.kwargs["data_source_metadata"]
    assert passed_metadata.date_processed == TEST_DATE_PROCESSSED

    all_fields = {"element_id", "text", "type", "metadata"}
    expected_data_source = {
        "url": TEST_SOURCE_URL,
        "version": TEST_VERSION,
        "record_locator": TEST_RECORD_LOCATOR,
        "date_created": TEST_DATE_CREATED,
        "date_modified": TEST_DATE_MODIFIED,
        "date_processed": TEST_DATE_PROCESSSED,
    }
    for element in elements:
        assert all_fields == set(element.keys())
        data_source = element["metadata"]["data_source"]
        for field, expected_value in expected_data_source.items():
            assert data_source[field] == expected_value
2023-06-06 09:03:13 -07:00
def test_process_file_metadata_includes_filename_and_filetype(mocker, partition_test_results):
    """Validate when metadata_include is set to "filename,filetype",
    only filename and filetype are included in metadata"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    test_ingest_doc = TestIngestDoc(
        config=TEST_CONFIG,
        standard_config=StandardConnectorConfig(
            download_dir=TEST_DOWNLOAD_DIR,
            output_dir=TEST_OUTPUT_DIR,
            metadata_include="filename,filetype",
        ),
    )
    isd_elems = test_ingest_doc.process_file()
    # Guard against an empty result, which would let the loop below pass vacuously.
    assert isd_elems
    for elem in isd_elems:
        assert set(elem["metadata"].keys()) == {"filename", "filetype"}
2023-06-06 09:03:13 -07:00
2023-06-06 09:03:13 -07:00
def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
    """With metadata_exclude set to "filename,page_number", no element's metadata
    may contain either the filename or the page_number key."""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    standard_config = StandardConnectorConfig(
        download_dir=TEST_DOWNLOAD_DIR,
        output_dir=TEST_OUTPUT_DIR,
        metadata_exclude="filename,page_number",
    )
    ingest_doc = TestIngestDoc(config=TEST_CONFIG, standard_config=standard_config)

    elements = ingest_doc.process_file()

    assert len(elements) > 0
    for element in elements:
        metadata_keys = element["metadata"].keys()
        assert "filename" not in metadata_keys
        assert "page_number" not in metadata_keys
2023-06-06 09:03:13 -07:00
def test_process_file_flatten_metadata(mocker, partition_test_results):
    """Validate when flatten_metadata is set with metadata_include "filename,data_source",
    each element has exactly the keys "element_id", "text", "type", "filename" and
    "data_source" (no nested "metadata" key)"""
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    test_ingest_doc = TestIngestDoc(
        config=TEST_CONFIG,
        standard_config=StandardConnectorConfig(
            download_dir=TEST_DOWNLOAD_DIR,
            output_dir=TEST_OUTPUT_DIR,
            metadata_include="filename,data_source",
            flatten_metadata=True,
        ),
    )
    isd_elems = test_ingest_doc.process_file()
    # Consistent with the sibling tests: without this check an empty result would
    # make the loop below pass without asserting anything.
    assert isd_elems
    expected_keys = {"element_id", "text", "type", "filename", "data_source"}
    for elem in isd_elems:
        assert expected_keys == set(elem.keys())