2023-06-06 09:03:13 -07:00
|
|
|
import os
|
|
|
|
import pathlib
|
|
|
|
from dataclasses import dataclass
|
2023-06-07 21:22:18 -07:00
|
|
|
from typing import Any, Dict
|
2023-06-06 09:03:13 -07:00
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2023-06-07 21:22:18 -07:00
|
|
|
from unstructured.documents.elements import DataSourceMetadata
|
2023-06-06 09:03:13 -07:00
|
|
|
from unstructured.ingest.interfaces import (
|
|
|
|
BaseConnectorConfig,
|
|
|
|
BaseIngestDoc,
|
2023-09-11 11:40:56 -04:00
|
|
|
PartitionConfig,
|
|
|
|
ReadConfig,
|
2023-06-06 09:03:13 -07:00
|
|
|
)
|
|
|
|
from unstructured.partition.auto import partition
|
|
|
|
from unstructured.staging.base import convert_to_dict
|
|
|
|
|
|
|
|
# Absolute directory containing this test module; anchor for the paths below.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Sample documents are looked up two levels above this module, in "example-docs".
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "../..", "example-docs")

# Download and output locations handed to ReadConfig / PartitionConfig by the tests.
TEST_DOWNLOAD_DIR = TEST_OUTPUT_DIR = "/tmp"

# Identifier stored on the connector config used by the tests.
TEST_ID = "test"

# Sample plain-text document partitioned by the tests in this module.
TEST_FILE_PATH = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt")
|
|
|
|
|
2023-06-06 09:03:13 -07:00
|
|
|
|
|
|
|
@dataclass
class TestConfig(BaseConnectorConfig):
    """Minimal connector configuration used to build ingest docs in these tests."""

    # Identifier of this configuration.
    id: str
    # Path associated with this configuration.
    path: str
|
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-07 21:22:18 -07:00
|
|
|
# Connector config passed to every TestIngestDoc constructed in this module.
TEST_CONFIG = TestConfig(id=TEST_ID, path=TEST_FILE_PATH)

# Data source metadata values: returned by TestIngestDoc's properties and
# compared against the "data_source" metadata of the resulting elements.
TEST_SOURCE_URL = "test-source-url"
TEST_VERSION = "1.1.1"
TEST_RECORD_LOCATOR = {"id": "data-source-id"}

# Timestamps are kept as ISO-8601 formatted strings.
TEST_DATE_CREATED = "2021-01-01T00:00:00"
TEST_DATE_MODIFIED = "2021-01-02T00:00:00"
TEST_DATE_PROCESSSED = "2022-12-13T15:44:08"
|
2023-06-06 09:03:13 -07:00
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-06 09:03:13 -07:00
|
|
|
@dataclass
class TestIngestDoc(BaseIngestDoc):
    """Ingest doc stand-in whose properties return the module's TEST_* constants.

    Nothing is fetched or written: the file-handling methods are no-ops and the
    existence/output checks always answer True.
    """

    connector_config: TestConfig

    @property
    def filename(self) -> str:
        """Path of the sample document (TEST_FILE_PATH)."""
        return TEST_FILE_PATH

    @property
    def _output_filename(self) -> str:
        """Sample document path with a ".json" suffix appended."""
        return f"{TEST_FILE_PATH}.json"

    @property
    def source_url(self) -> str:
        """Fixed source URL (TEST_SOURCE_URL)."""
        return TEST_SOURCE_URL

    @property
    def version(self) -> str:
        """Fixed version string (TEST_VERSION)."""
        return TEST_VERSION

    @property
    def record_locator(self) -> Dict[str, Any]:
        """Fixed record locator; this is the shared TEST_RECORD_LOCATOR dict, not a copy."""
        return TEST_RECORD_LOCATOR

    @property
    def date_created(self) -> str:
        """Fixed creation timestamp (TEST_DATE_CREATED)."""
        return TEST_DATE_CREATED

    @property
    def date_modified(self) -> str:
        """Fixed modification timestamp (TEST_DATE_MODIFIED)."""
        return TEST_DATE_MODIFIED

    @property
    def exists(self) -> bool:
        """Always True for this stand-in."""
        return True

    def cleanup_file(self) -> None:
        """Intentionally do nothing."""

    def get_file(self) -> None:
        """Intentionally do nothing; filename already points at the sample document."""

    def has_output(self) -> bool:
        """Always report that output exists."""
        return True

    def write_result(self, result) -> None:
        """Intentionally discard ``result``; nothing is written."""
|
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-06 09:03:13 -07:00
|
|
|
@pytest.fixture()
def partition_test_results():
    """Return the elements produced by partitioning TEST_FILE_PATH.

    The call attaches data source metadata built from the TEST_* constants so
    tests can compare element metadata against the same values. The fixture is
    declared without a scope, i.e. pytest's default function scope applies and
    the document is partitioned again for each test that requests it.
    """
    data_source = DataSourceMetadata(
        url=TEST_SOURCE_URL,
        version=TEST_VERSION,
        record_locator=TEST_RECORD_LOCATOR,
        date_created=TEST_DATE_CREATED,
        date_modified=TEST_DATE_MODIFIED,
        date_processed=TEST_DATE_PROCESSSED,
    )
    return partition(
        filename=str(TEST_FILE_PATH),
        data_source_metadata=data_source,
    )
|
2023-06-06 09:03:13 -07:00
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-06 09:03:13 -07:00
|
|
|
@pytest.fixture()
def partition_file_test_results(partition_test_results):
    """Return ``partition_test_results`` passed through ``convert_to_dict``.

    Declared without a scope, so it is rebuilt for each test that requests it.
    """
    elements_as_dicts = convert_to_dict(partition_test_results)
    return elements_as_dicts
|
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-07 21:22:18 -07:00
|
|
|
def test_partition_file():
    """Validate partition_file returns a list of dictionaries with the expected keys,
    metadata keys, and data source metadata values."""
    expected_keys = {"element_id", "text", "type", "metadata"}
    # The document in TEST_FILE_PATH does not have elements with coordinates so
    # partition is not expected to return coordinates metadata.
    expected_metadata_keys = {
        "data_source",
        "filename",
        "file_directory",
        "filetype",
        "last_modified",
    }
    expected_data_source = {
        "url": TEST_SOURCE_URL,
        "version": TEST_VERSION,
        "record_locator": TEST_RECORD_LOCATOR,
        "date_created": TEST_DATE_CREATED,
        "date_modified": TEST_DATE_MODIFIED,
        "date_processed": TEST_DATE_PROCESSSED,
    }

    ingest_doc = TestIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        partition_config=PartitionConfig(output_dir=TEST_OUTPUT_DIR),
    )
    # Fix the processing timestamp up front; presumably this is what surfaces as
    # data_source["date_processed"], which is compared below.
    ingest_doc._date_processed = TEST_DATE_PROCESSSED

    isd_elems = ingest_doc.partition_file()
    assert len(isd_elems)

    for element in isd_elems:
        metadata = element["metadata"]
        # Parent IDs are non-deterministic - remove them from the test
        metadata.pop("parent_id", None)

        assert expected_keys == set(element.keys())
        assert expected_metadata_keys == set(metadata.keys())

        data_source_metadata = metadata["data_source"]
        for field, expected_value in expected_data_source.items():
            assert data_source_metadata[field] == expected_value
|
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-06 09:03:13 -07:00
|
|
|
def test_process_file_fields_include_default(mocker, partition_test_results):
    """Validate when metadata_include and metadata_exclude are not set, all fields:
    ("element_id", "text", "type", "metadata") are included"""
    # Replace the partition name looked up in the interfaces module so
    # process_file works on the precomputed fixture elements.
    mock_partition = mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    expected_data_source = {
        "url": TEST_SOURCE_URL,
        "version": TEST_VERSION,
        "record_locator": TEST_RECORD_LOCATOR,
        "date_created": TEST_DATE_CREATED,
        "date_modified": TEST_DATE_MODIFIED,
        "date_processed": TEST_DATE_PROCESSSED,
    }

    ingest_doc = TestIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        partition_config=PartitionConfig(output_dir=TEST_OUTPUT_DIR),
    )
    isd_elems = ingest_doc.process_file()

    assert len(isd_elems)
    assert mock_partition.call_count == 1

    for element in isd_elems:
        metadata = element["metadata"]
        # Parent IDs are non-deterministic - remove them from the test
        metadata.pop("parent_id", None)

        assert {"element_id", "text", "type", "metadata"} == set(element.keys())

        data_source_metadata = metadata["data_source"]
        for field, expected_value in expected_data_source.items():
            assert data_source_metadata[field] == expected_value
|
2023-06-06 09:03:13 -07:00
|
|
|
|
|
|
|
|
Feat: Create a naive hierarchy for elements (#1268)
## **Summary**
By adding hierarchy to unstructured elements, users will have more
information for implementing vector db/LLM chunking strategies. For
example, text elements could be queried by their preceding title
element. The hierarchy is implemented by a parent_id tag in the
element's metadata.
### Features
- Introduces a parent_id to ElementMetadata (The id of the parent
element, not a pointer)
- Creates a rule set for assigning hierarchies. Sensible default is
assigned, with an optional override parameter
- Sets element parent ids if there isn't an existing parent id or
matches the ruleset
### How it works
Hierarchies are assigned via a parent id field in element metadata.
Elements are read sequentially and evaluated against a ruleset. For
example take the following elements:
1. Title, "This is the Title"
2. Text, "this is the text"
And the ruleset: `{"title": ["text"]}`. When evaluated, the parent_id of
2 will be the id of 1. The algorithm for determining this is more
complex and resolves several edge cases, so please read the code for
further details.
### Schema Changes
```
@dataclass
class ElementMetadata:
coordinates: Optional[CoordinatesMetadata] = None
data_source: Optional[DataSourceMetadata] = None
filename: Optional[str] = None
file_directory: Optional[str] = None
last_modified: Optional[str] = None
filetype: Optional[str] = None
attached_to_filename: Optional[str] = None
+ parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None
+ category_depth: Optional[int] = None
...
```
### Testing
```
from unstructured.partition.auto import partition
from typing import List
elements = partition(filename="./unstructured/example-docs/fake-html.html", strategy="auto")
for element in elements:
print(
f"Category: {getattr(element, 'category', '')}\n"\
f"Text: {getattr(element, 'text', '')}\n"
f"ID: {element.id}\n" \
f"Parent ID: {element.metadata.parent_id}\n"\
f"Depth: {element.metadata.category_depth}\n" \
)
```
### Additional Notes
Implementing this feature revealed a possibly undesired side-effect in
how element metadata are processed. In
`unstructured/partition/common.py` the `_add_element_metadata` is
invoked as part of the `add_metadata_with_filetype` decorator for
filetype partitioning. This method is intended to add additional
information to the metadata generated with the element including
filename and filetype, however the existing metadata is merged into a
newly created metadata object rather than the other way around. Because
of the way it's structured, new metadata fields can easily be forgotten
and pose debugging challenges to developers. This likely warrants a new
issue.
I'm guessing that the implementation is done this way to avoid issues
with deserializing elements, but could be wrong.
---------
Co-authored-by: Benjamin Torres <benjats07@users.noreply.github.com>
2023-09-14 11:23:16 -04:00
|
|
|
def test_process_file_metadata_includes_filename_and_filetype(
    mocker,
    partition_test_results,
):
    """Validate when metadata_include is set to ["filename", "filetype"],
    only filename and filetype are included in metadata"""
    # Replace the partition name looked up in the interfaces module so
    # process_file works on the precomputed fixture elements.
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    test_ingest_doc = TestIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        partition_config=PartitionConfig(
            output_dir=TEST_OUTPUT_DIR,
            metadata_include=["filename", "filetype"],
        ),
    )
    isd_elems = test_ingest_doc.process_file()
    assert len(isd_elems)
    for elem in isd_elems:
        # Parent IDs are non-deterministic - remove them from the test
        elem["metadata"].pop("parent_id", None)

        assert set(elem["metadata"].keys()) == {"filename", "filetype"}
|
2023-06-06 09:03:13 -07:00
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-06 09:03:13 -07:00
|
|
|
def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results):
    """Validate when metadata_exclude is set to "filename,page_number",
    neither filename nor page_number are included in metadata"""
    # Replace the partition name looked up in the interfaces module so
    # process_file works on the precomputed fixture elements.
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    excluding_config = PartitionConfig(
        output_dir=TEST_OUTPUT_DIR,
        metadata_exclude=["filename", "page_number"],
    )
    ingest_doc = TestIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        partition_config=excluding_config,
    )

    isd_elems = ingest_doc.process_file()
    assert len(isd_elems)

    for element in isd_elems:
        metadata = element["metadata"]
        assert "filename" not in metadata
        assert "page_number" not in metadata
|
2023-06-06 09:03:13 -07:00
|
|
|
|
2023-06-20 11:19:55 -05:00
|
|
|
|
2023-06-06 09:03:13 -07:00
|
|
|
def test_process_file_flatten_metadata(mocker, partition_test_results):
    """Validate when flatten_metadata is set with metadata_include of
    ["filename", "data_source"], each element exposes those two entries as
    top-level keys next to "element_id", "text" and "type", with no "metadata" key"""
    # Replace the partition name looked up in the interfaces module so
    # process_file works on the precomputed fixture elements.
    mocker.patch(
        "unstructured.ingest.interfaces.partition",
        return_value=partition_test_results,
    )
    test_ingest_doc = TestIngestDoc(
        connector_config=TEST_CONFIG,
        read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
        partition_config=PartitionConfig(
            output_dir=TEST_OUTPUT_DIR,
            metadata_include=["filename", "data_source"],
            flatten_metadata=True,
        ),
    )
    isd_elems = test_ingest_doc.process_file()
    # Guard against a vacuous pass: with no elements the loop below checks nothing.
    assert len(isd_elems)
    expected_keys = {"element_id", "text", "type", "filename", "data_source"}
    for elem in isd_elems:
        assert expected_keys == set(elem.keys())
|