2023-03-28 16:15:22 -04:00
|
|
|
import os
|
|
|
|
import pathlib
|
|
|
|
|
|
|
|
import msg_parser
|
|
|
|
import pytest
|
|
|
|
|
2023-09-11 16:00:14 -05:00
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2023-04-04 14:23:41 -04:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
ElementMetadata,
|
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
|
|
|
Title,
|
|
|
|
)
|
2023-08-29 16:59:26 -04:00
|
|
|
from unstructured.partition.json import partition_json
|
2023-04-21 11:14:46 -05:00
|
|
|
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
|
2023-06-29 18:01:12 -04:00
|
|
|
from unstructured.partition.text import partition_text
|
2023-10-05 15:26:47 -05:00
|
|
|
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
|
2023-08-29 16:59:26 -04:00
|
|
|
from unstructured.staging.base import elements_to_json
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
# Resolved directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# The "example-docs" folder, addressed as three levels above DIRECTORY.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, *[".."] * 3, "example-docs")
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
# Elements the tests below expect when partitioning "fake-email.msg", in document order.
EXPECTED_MSG_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]
|
|
|
|
|
2023-04-21 11:14:46 -05:00
|
|
|
# The single attachment record the tests expect for "fake-email-attachment.msg".
_FAKE_ATTACHMENT_INFO = {
    "filename": "fake-attachment.txt",
    "extension": ".txt",
    "file_size": "unknown",
    "payload": b"Hey this is a fake attachment!",
}
ATTACH_EXPECTED_OUTPUT = [_FAKE_ATTACHMENT_INFO]
|
|
|
|
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
def test_partition_msg_from_filename():
    """Partition the sample .msg by path; check the elements and the first element's metadata."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    elements = partition_msg(filename=msg_path)
    first_metadata = elements[0].metadata
    # parent_id is copied from the actual result, so its value is not what is being checked.
    actual_parent_id = first_metadata.parent_id

    assert elements == EXPECTED_MSG_OUTPUT

    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=msg_path,
        last_modified="2022-12-16T17:04:16-05:00",
        page_number=None,
        url=None,
        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
        sent_to=["Matthew Robinson (None)"],
        subject="Test Email",
        filetype="application/vnd.ms-outlook",
        parent_id=actual_parent_id,
        languages=["eng"],
    )
    assert first_metadata.to_dict() == expected_metadata.to_dict()

    assert all(element.metadata.filename == "fake-email.msg" for element in elements)

    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        detection_origins = {element.metadata.detection_origin for element in elements}
        assert detection_origins == {"msg"}
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
2023-09-15 09:51:22 -07:00
|
|
|
def test_partition_msg_from_filename_returns_uns_elements():
    """The first element produced for the sample .msg is a NarrativeText instance."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    first_element = partition_msg(filename=msg_path)[0]
    assert isinstance(first_element, NarrativeText)
|
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_msg_from_filename_with_metadata_filename():
    """Every element reports the `metadata_filename` passed in as its metadata filename."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    elements = partition_msg(filename=msg_path, metadata_filename="test")
    for element in elements:
        assert element.metadata.filename == "test"
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
|
|
|
class MockMsOxMessage:
    """Stand-in message object exposing a plain-text `body` and a `header_dict`."""

    def __init__(self, filename):
        # `filename` is accepted but never read; the attributes below are fixed.
        self.header_dict = {"Content-Type": "text/plain"}
        self.body = "Here is an email with plain text."
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_filename_with_text_content(monkeypatch):
    """With `msg_parser.MsOxMessage` replaced by the mock, the mock's body text is partitioned."""
    monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage)
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    first_element = partition_msg(filename=msg_path)[0]

    assert str(first_element) == "Here is an email with plain text."
    first_metadata = first_element.metadata
    assert first_metadata.filename == "fake-email.msg"
    assert first_metadata.file_directory == EXAMPLE_DOCS_DIRECTORY
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_raises_with_missing_file():
    """Partitioning the path "doesnt-exist.msg" raises FileNotFoundError."""
    missing_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg")
    with pytest.raises(FileNotFoundError):
        partition_msg(filename=missing_path)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file():
    """Partition from an open binary file object; no filename is recorded in the metadata."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    with open(msg_path, "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(element.metadata.filename is None for element in elements)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file_with_metadata_filename():
    """Partition from a file object with `metadata_filename`; that name appears on every element."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    with open(msg_path, "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="test")

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(element.metadata.filename == "test" for element in elements)
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
2023-04-21 11:14:46 -05:00
|
|
|
def test_extract_attachment_info():
    """Attachment info extracted from the sample email matches ATTACH_EXPECTED_OUTPUT."""
    # Use the shared example-docs constant, as the sibling tests do, rather than
    # re-deriving the same path from DIRECTORY.
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.msg")
    attachment_info = extract_msg_attachment_info(filename)
    assert len(attachment_info) > 0
    assert attachment_info == ATTACH_EXPECTED_OUTPUT
|
|
|
|
|
|
|
|
|
2023-03-28 16:15:22 -04:00
|
|
|
def test_partition_msg_raises_with_both_specified():
    """Passing both `filename` and `file` raises ValueError."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    with open(msg_path, "rb") as msg_file:
        with pytest.raises(ValueError):
            partition_msg(filename=msg_path, file=msg_file)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_raises_with_neither():
    """Calling `partition_msg` with no arguments raises ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_msg()
|
2023-06-29 18:01:12 -04:00
|
|
|
|
|
|
|
|
2023-06-30 09:44:46 -05:00
|
|
|
def test_partition_msg_from_filename_exclude_metadata():
    """With `include_metadata=False`, every element's metadata serializes to an empty dict."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    elements = partition_msg(filename=filename, include_metadata=False)

    # Iterate the elements directly; the index was only ever used to fetch the element.
    for element in elements:
        assert element.metadata.to_dict() == {}
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file_exclude_metadata():
    """Partition from a file object with `include_metadata=False`; metadata dicts are empty."""
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    with open(filename, "rb") as f:
        elements = partition_msg(file=f, include_metadata=False)

    # Iterate the elements directly; the index was only ever used to fetch the element.
    for element in elements:
        assert element.metadata.to_dict() == {}
|
|
|
|
|
|
|
|
|
2023-06-29 18:01:12 -04:00
|
|
|
def test_partition_msg_can_process_attachments(
    tmpdir,
    filename="example-docs/fake-email-attachment.msg",
):
    """Attachment elements from `partition_msg` match partitioning the extracted file directly.

    The attachment is written next to `tmpdir` and partitioned with `partition_text`
    to build the expected metadata, which is then compared with the last element
    returned by `partition_msg(..., process_attachments=True)`.
    """
    output_dir = tmpdir.dirname
    extract_msg_attachment_info(filename=filename, output_dir=output_dir)
    attachment_path = os.path.join(output_dir, ATTACH_EXPECTED_OUTPUT[0]["filename"])

    # The same fixed timestamp is passed to both partition calls below.
    fixed_last_modified = "2029-07-05T09:24:28"

    expected_metadata = partition_text(
        filename=attachment_path,
        metadata_filename=attachment_path,
        metadata_last_modified=fixed_last_modified,
    )[0].metadata
    expected_metadata.file_directory = None
    expected_metadata.attached_to_filename = filename

    elements = partition_msg(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        metadata_last_modified=fixed_last_modified,
    )

    # This test does not need to validate if hierarchy is working
    # Patch to nullify parent_id
    expected_metadata.parent_id = None
    attachment_element = elements[-1]
    attachment_element.metadata.parent_id = None

    assert elements[0].text.startswith("Hello!")
    message_elements = elements[:-1]
    for message_element in message_elements:
        assert message_element.metadata.filename == "fake-email-attachment.msg"
        assert message_element.metadata.subject == "Fake email with attachment"
    assert attachment_element.text == "Hey this is a fake attachment!"
    assert attachment_element.metadata == expected_metadata
|
|
|
|
|
|
|
|
|
2023-08-18 18:21:11 -05:00
|
|
|
def test_partition_msg_can_process_min_max_wtih_attachments(
    tmpdir,
    filename="example-docs/fake-email-attachment.msg",
):
    """`min_partition`/`max_partition` given to `partition_msg` bound attachment element text.

    The trailing elements are compared with those from partitioning the extracted
    attachment directly using the same limits.
    """
    output_dir = tmpdir.dirname
    extract_msg_attachment_info(filename=filename, output_dir=output_dir)
    attachment_path = os.path.join(output_dir, ATTACH_EXPECTED_OUTPUT[0]["filename"])

    attachment_elements = partition_text(
        filename=attachment_path,
        metadata_filename=attachment_path,
        min_partition=6,
        max_partition=12,
    )

    elements = partition_msg(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
        min_partition=6,
        max_partition=12,
    )

    assert elements[0].text.startswith("Hello!")
    assert elements[-1].text == attachment_elements[-1].text
    assert elements[-2].text == attachment_elements[-2].text

    # Only elements that came from an attachment are held to the length bounds.
    from_attachment = [
        element for element in elements if element.metadata.attached_to_filename is not None
    ]
    for element in from_attachment:
        text_length = len(element.text)
        assert text_length <= 12
        assert text_length >= 6
|
|
|
|
|
|
|
|
|
2023-06-29 18:01:12 -04:00
|
|
|
def test_partition_msg_raises_with_no_partitioner(
    filename="example-docs/fake-email-attachment.msg",
):
    """`process_attachments=True` without an `attachment_partitioner` raises ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        partition_msg(filename=filename, process_attachments=True)
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file_custom_metadata_date(
    filename="example-docs/fake-email.msg",
):
    """`metadata_last_modified` is reported as `last_modified` when partitioning a file object."""
    expected_last_modification_date = "2020-07-05T09:24:28"

    with open(filename, "rb") as msg_file:
        first_element = partition_msg(
            file=msg_file,
            metadata_last_modified=expected_last_modification_date,
        )[0]

    assert first_element.metadata.last_modified == expected_last_modification_date
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_custom_metadata_date(
    filename="example-docs/fake-email.msg",
):
    """`metadata_last_modified` is reported as `last_modified` when partitioning by filename."""
    expected_last_modification_date = "2020-07-05T09:24:28"

    first_element = partition_msg(
        filename=filename,
        metadata_last_modified=expected_last_modification_date,
    )[0]

    assert first_element.metadata.last_modified == expected_last_modification_date
|
2023-08-25 20:09:25 -04:00
|
|
|
|
|
|
|
|
2023-08-29 16:59:26 -04:00
|
|
|
def test_partition_msg_with_json():
    """Elements round-tripped through `elements_to_json` / `partition_json` compare equal."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    elements = partition_msg(filename=msg_path)
    serialized = elements_to_json(elements)
    test_elements = partition_json(text=serialized)

    assert elements == test_elements

    # Spot-check the email-specific metadata fields on the first element.
    original_metadata = elements[0].metadata
    assert original_metadata.sent_from == test_elements[0].metadata.sent_from
    assert original_metadata.sent_to[0] == test_elements[0].metadata.sent_to[0]
    assert original_metadata.subject == test_elements[0].metadata.subject
|
|
|
|
|
|
|
|
|
2023-08-25 20:09:25 -04:00
|
|
|
def test_partition_msg_with_pgp_encrypted_message(
    caplog,
    filename="example-docs/fake-encrypted.msg",
):
    """The encrypted sample yields no elements and the captured log mentions the encryption."""
    elements = partition_msg(filename=filename)

    assert elements == []
    for expected_fragment in ("WARNING", "Encrypted email detected"):
        assert expected_fragment in caplog.text
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
2023-10-03 09:40:34 -07:00
|
|
|
def test_add_chunking_strategy_by_title_on_partition_msg(
    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg"),
):
    """`chunking_strategy="by_title"` gives the same result as `chunk_by_title` on plain output."""
    unchunked = partition_msg(filename=filename)
    chunked_by_partition = partition_msg(filename, chunking_strategy="by_title")
    chunked_manually = chunk_by_title(unchunked)

    assert chunked_by_partition != unchunked
    assert chunked_by_partition == chunked_manually
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_element_metadata_has_languages():
    """Without a `languages` argument, the first element's languages are ["eng"]."""
    first_element = partition_msg(filename="example-docs/fake-email.msg")[0]
    assert first_element.metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_respects_languages_arg():
    """An explicit `languages=["deu"]` is reflected in every element's metadata."""
    elements = partition_msg(filename="example-docs/fake-email.msg", languages=["deu"])
    for element in elements:
        assert element.metadata.languages == ["deu"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_raises_TypeError_for_invalid_languages():
    """Passing `languages` as a bare string instead of a list raises TypeError."""
    filename = "example-docs/fake-email.msg"
    # Keep only the call under test inside the raises-context so the expected
    # exception can only come from `partition_msg` itself.
    with pytest.raises(TypeError):
        partition_msg(filename=filename, languages="eng")
|