2024-05-29 14:36:05 -07:00
|
|
|
"""Test suite for `unstructured.partition.msg` module."""
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
2024-06-05 14:12:27 -07:00
|
|
|
import io
|
|
|
|
from typing import Any
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
import pytest
|
2024-06-05 14:12:27 -07:00
|
|
|
from oxmsg import Message
|
2023-03-28 16:15:22 -04:00
|
|
|
|
2024-05-29 14:36:05 -07:00
|
|
|
from test_unstructured.unit_utils import (
|
2024-06-05 14:12:27 -07:00
|
|
|
FixtureRequest,
|
2024-05-29 14:36:05 -07:00
|
|
|
LogCaptureFixture,
|
2024-06-05 14:12:27 -07:00
|
|
|
Mock,
|
2024-05-29 14:36:05 -07:00
|
|
|
assert_round_trips_through_JSON,
|
|
|
|
example_doc_path,
|
2024-10-15 19:02:33 -07:00
|
|
|
function_mock,
|
2024-06-05 14:12:27 -07:00
|
|
|
property_mock,
|
2024-05-29 14:36:05 -07:00
|
|
|
)
|
2023-09-11 16:00:14 -05:00
|
|
|
from unstructured.chunking.title import chunk_by_title
|
2023-04-04 14:23:41 -04:00
|
|
|
from unstructured.documents.elements import (
|
|
|
|
ElementMetadata,
|
|
|
|
ListItem,
|
|
|
|
NarrativeText,
|
2024-10-15 19:02:33 -07:00
|
|
|
Text,
|
2023-04-04 14:23:41 -04:00
|
|
|
)
|
2024-10-15 19:02:33 -07:00
|
|
|
from unstructured.partition.common import UnsupportedFileFormatError
|
2024-06-06 01:31:56 -07:00
|
|
|
from unstructured.partition.msg import MsgPartitionerOptions, partition_msg
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
# -- elements the tests below expect from partitioning `fake-email.msg`, in order --
EXPECTED_MSG_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Text("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_filename():
    """Partitioning by file path yields the expected elements and first-element metadata."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=msg_path)

    # -- parent_id is read back from the result, so it is not pinned to a fixed value --
    first_metadata = elements[0].metadata
    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=msg_path,
        last_modified="2023-03-28T17:00:31+00:00",
        page_number=None,
        url=None,
        sent_from=['"Matthew Robinson" <mrobinson@unstructured.io>'],
        sent_to=["mrobinson@unstructured.io"],
        subject="Test Email",
        filetype="application/vnd.ms-outlook",
        parent_id=first_metadata.parent_id,
        languages=["eng"],
    )

    assert elements == EXPECTED_MSG_OUTPUT
    assert first_metadata.to_dict() == expected_metadata.to_dict()
|
2023-07-05 15:02:22 -05:00
|
|
|
|
|
|
|
|
2023-09-15 09:51:22 -07:00
|
|
|
def test_partition_msg_from_filename_returns_uns_elements():
    """The first element produced from the example email is a `NarrativeText`."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"))

    first_element = elements[0]

    assert isinstance(first_element, NarrativeText)
|
|
|
|
|
|
|
|
|
2023-07-05 15:02:22 -05:00
|
|
|
def test_partition_msg_from_filename_with_metadata_filename():
    """A `metadata_filename` argument is reported as the filename of every element."""
    elements = partition_msg(
        filename=example_doc_path("fake-email.msg"), metadata_filename="test"
    )

    for element in elements:
        assert element.metadata.filename == "test"
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
2024-06-05 14:12:27 -07:00
|
|
|
def test_partition_msg_from_filename_with_text_content():
    """The first element carries the expected text, filename and file directory."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"))

    first_element = elements[0]
    assert str(first_element) == "This is a test email to use for unit tests."

    first_metadata = first_element.metadata
    assert first_metadata.filename == "fake-email.msg"
    assert first_metadata.file_directory == example_doc_path("")
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_raises_with_missing_file():
    """Partitioning a path to a file that does not exist raises `FileNotFoundError`."""
    # -- the path is resolved outside the `raises` block so only the partitioner call
    # -- itself is expected to raise
    missing_path = example_doc_path("doesnt-exist.msg")

    with pytest.raises(FileNotFoundError):
        partition_msg(filename=missing_path)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file():
    """Partitioning an open binary file yields the expected elements with no filename."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(element.metadata.filename is None for element in elements)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file_with_metadata_filename():
    """A `metadata_filename` argument is used as the filename when partitioning a file."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="test")

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(element.metadata.filename == "test" for element in elements)
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
2024-06-05 14:12:27 -07:00
|
|
|
def test_partition_msg_uses_file_path_when_both_are_specified():
    """When both a path and a file are given, the result matches the file at the path."""
    # -- the file-like object holds bytes that are not the example email, so getting the
    # -- expected elements back shows the path was the source that was used
    decoy_file = io.BytesIO(b"abcde")
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(msg_path, file=decoy_file)

    assert elements == EXPECTED_MSG_OUTPUT
|
2023-03-28 16:15:22 -04:00
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_raises_with_neither():
    """Calling `partition_msg()` with no arguments at all raises `ValueError`."""
    with pytest.raises(ValueError):
        partition_msg()
|
2023-06-29 18:01:12 -04:00
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
# -- attachments ---------------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-05 14:12:27 -07:00
|
|
|
def test_partition_msg_can_process_attachments():
    """With `process_attachments=True`, body elements are followed by attachment elements."""
    elements = partition_msg(
        example_doc_path("fake-email-multiple-attachments.msg"), process_attachments=True
    )

    # -- each slice of the result is expected to come from a single source document --
    expected_filename_by_span = [
        (slice(None, 5), "fake-email-multiple-attachments.msg"),
        (slice(5, 7), "unstructured_logo.png"),
        (slice(7, 343), "dense_doc.pdf"),
        (slice(343, None), "Engineering Onboarding.pptx"),
    ]
    for span, expected_filename in expected_filename_by_span:
        assert all(e.metadata.filename == expected_filename for e in elements[span])

    body_texts = [e.text for e in elements[:5]]
    assert body_texts == [
        "Here are those documents.",
        "--",
        "Mallori Harrell",
        "Unstructured Technologies",
        "Data Scientist",
    ]

    type_names = [type(e).__name__ for e in elements]
    assert type_names[:10] == [
        "NarrativeText",
        "Text",
        "Text",
        "Text",
        "Text",
        "Image",
        "Text",
        "Text",
        "Title",
        "Title",
    ]
    # -- the final ten elements are one title followed by nine list items --
    assert type_names[-10:] == ["Title"] + ["ListItem"] * 9
|
|
|
|
|
|
|
|
|
2024-10-15 19:02:33 -07:00
|
|
|
def test_partition_msg_silently_skips_attachments_it_cannot_partition(request: FixtureRequest):
    """Attachments whose partitioning raises `UnsupportedFileFormatError` add no elements."""
    # -- make every call to the auto-partitioner fail as an unsupported format --
    function_mock(
        request,
        "unstructured.partition.auto.partition",
        side_effect=UnsupportedFileFormatError(),
    )
    msg_path = example_doc_path("fake-email-multiple-attachments.msg")

    elements = partition_msg(msg_path, process_attachments=True)

    # -- reaching this point means no exception was raised --
    # -- only the email body is present; nothing appears for the attachment(s) --
    expected_body_elements = [
        NarrativeText("Here are those documents."),
        Text("--"),
        Text("Mallori Harrell"),
        Text("Unstructured Technologies"),
        Text("Data Scientist"),
    ]
    assert elements == expected_body_elements
|
|
|
|
|
|
|
|
|
2024-10-04 14:01:32 -07:00
|
|
|
# -- .metadata.filename --------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_filename_gets_filename_metadata_from_file_path():
    """Filename and directory metadata of every element come from the partitioned path."""
    elements = partition_msg(example_doc_path("fake-email.msg"))

    for element in elements:
        assert element.metadata.filename == "fake-email.msg"
    for element in elements:
        assert element.metadata.file_directory == example_doc_path("")
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file_gets_filename_metadata_None():
    """Partitioning from an open file leaves filename and directory metadata as `None`."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    for element in elements:
        assert element.metadata.filename is None
    for element in elements:
        assert element.metadata.file_directory is None
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_filename_prefers_metadata_filename():
    """`metadata_filename` overrides the partitioned path in filename/directory metadata."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(msg_path, metadata_filename="a/b/c.msg")

    for element in elements:
        assert element.metadata.filename == "c.msg"
    for element in elements:
        assert element.metadata.file_directory == "a/b"
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file_prefers_metadata_filename():
    """`metadata_filename` supplies filename/directory metadata when partitioning a file."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="d/e/f.msg")

    for element in elements:
        assert element.metadata.filename == "f.msg"
    for element in elements:
        assert element.metadata.file_directory == "d/e"
|
|
|
|
|
|
|
|
|
|
|
|
# -- .metadata.filetype --------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype():
    """Every element reports the Outlook MSG MIME type as its `filetype` metadata."""
    MSG_MIME_TYPE = "application/vnd.ms-outlook"

    elements = partition_msg(example_doc_path("fake-email.msg"))

    filetypes = [e.metadata.filetype for e in elements]
    # -- the failure message is only built when the assertion fails, so indexing
    # -- `elements[0]` there cannot raise on the passing path
    assert all(filetype == MSG_MIME_TYPE for filetype in filetypes), (
        f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:"
        f" {repr(elements[0].metadata.filetype)}"
    )
|
|
|
|
|
|
|
|
|
2024-09-23 15:23:10 -07:00
|
|
|
# -- .metadata.last_modified ---------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2024-06-05 14:12:27 -07:00
|
|
|
def test_partition_msg_pulls_last_modified_from_message_sent_date():
    """Every element of the example email has the expected `last_modified` timestamp."""
    expected_last_modified = "2023-03-28T17:00:31+00:00"

    elements = partition_msg(example_doc_path("fake-email.msg"))

    for element in elements:
        assert element.metadata.last_modified == expected_last_modified
|
2024-03-18 02:09:44 +01:00
|
|
|
|
|
|
|
|
2024-09-23 15:23:10 -07:00
|
|
|
def test_partition_msg_from_file_path_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is used for the first element."""
    caller_last_modified = "2020-07-05T09:24:28"
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(msg_path, metadata_last_modified=caller_last_modified)

    first_metadata = elements[0].metadata
    assert first_metadata.last_modified == caller_last_modified
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_from_file_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is used for every element of a file."""
    caller_last_modified = "2020-07-05T09:24:28"

    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_last_modified=caller_last_modified)

    for element in elements:
        assert element.metadata.last_modified == caller_last_modified
|
2023-07-26 15:10:14 -04:00
|
|
|
|
|
|
|
|
2024-09-23 15:23:10 -07:00
|
|
|
# ------------------------------------------------------------------------------------------------
|
2023-08-25 20:09:25 -04:00
|
|
|
|
|
|
|
|
2023-08-29 16:59:26 -04:00
|
|
|
def test_partition_msg_with_json():
    """Elements partitioned from the example email pass the JSON round-trip assertion."""
    msg_path = example_doc_path("fake-email.msg")

    assert_round_trips_through_JSON(partition_msg(msg_path))
|
2023-08-29 16:59:26 -04:00
|
|
|
|
|
|
|
|
2024-05-29 14:36:05 -07:00
|
|
|
def test_partition_msg_with_pgp_encrypted_message(caplog: LogCaptureFixture):
    """The encrypted example email yields no elements and logs a warning about it."""
    encrypted_path = example_doc_path("fake-encrypted.msg")

    elements = partition_msg(encrypted_path)

    assert elements == []
    captured_log = caplog.text
    assert "WARNING" in captured_log
    assert "Encrypted email detected" in captured_log
|
2023-09-11 16:00:14 -05:00
|
|
|
|
|
|
|
|
2024-05-29 14:36:05 -07:00
|
|
|
def test_add_chunking_strategy_by_title_on_partition_msg():
    """`chunking_strategy="by_title"` gives the same result as chunking the plain elements."""
    msg_path = example_doc_path("fake-email.msg")

    unchunked = partition_msg(filename=msg_path)
    chunked_by_partitioner = partition_msg(msg_path, chunking_strategy="by_title")
    chunked_afterwards = chunk_by_title(unchunked)

    # -- chunking changed the output, and both routes to chunking agree --
    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_afterwards
|
2023-10-10 20:47:56 -05:00
|
|
|
|
|
|
|
|
2024-06-05 14:12:27 -07:00
|
|
|
# -- language behaviors --------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
2023-10-10 20:47:56 -05:00
|
|
|
def test_partition_msg_element_metadata_has_languages():
    """The first element of the example email reports `["eng"]` as its languages."""
    # -- locate the example file with the shared helper, as the other tests in this module
    # -- do, instead of a hard-coded path relative to the current working directory
    filename = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=filename)

    assert elements[0].metadata.languages == ["eng"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_respects_languages_arg():
    """A `languages` argument is reported as the languages of every element."""
    # -- locate the example file with the shared helper, as the other tests in this module
    # -- do, instead of a hard-coded path relative to the current working directory
    filename = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=filename, languages=["deu"])

    assert all(element.metadata.languages == ["deu"] for element in elements)
|
|
|
|
|
|
|
|
|
|
|
|
def test_partition_msg_raises_TypeError_for_invalid_languages():
    """Passing `languages` as a bare string instead of a list raises `TypeError`."""
    # -- locate the example file with the shared helper, as the other tests in this module
    # -- do, and do it outside the `raises` block so that only the partitioner call is
    # -- expected to raise
    filename = example_doc_path("fake-email.msg")

    with pytest.raises(TypeError):
        partition_msg(filename=filename, languages="eng")
|
|
|
|
|
|
|
|
|
|
|
|
# ================================================================================================
|
|
|
|
# ISOLATED UNIT TESTS
|
|
|
|
# ================================================================================================
|
|
|
|
# These test components used by `partition_msg()` in isolation such that all edge cases can be
|
|
|
|
# exercised.
|
|
|
|
# ================================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
class DescribeMsgPartitionerOptions:
    """Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects.

    Each test starts from the all-defaults `opts_args` fixture and overrides only the
    arguments relevant to the property being exercised.
    """

    # -- .extra_msg_metadata ---------------------

    def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email-with-cc-and-bcc.msg"))

        extra_metadata = MsgPartitionerOptions(**opts_args).extra_msg_metadata

        assert extra_metadata.bcc_recipient == ["hello@unstructured.io"]
        assert extra_metadata.cc_recipient == ["steve@unstructured.io"]
        assert extra_metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
        assert extra_metadata.sent_from == ['"John" <johnjennings702@gmail.com>']
        assert extra_metadata.sent_to == [
            "john-ctr@unstructured.io",
            "steve@unstructured.io",
            "hello@unstructured.io",
        ]
        assert extra_metadata.subject == "Fake email with cc and bcc recipients"

    # -- .is_encrypted ---------------------------

    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
        [
            ("fake-encrypted.msg", True),
            ("fake-email.msg", False),
        ],
    )
    def it_knows_when_the_msg_is_encrypted(
        self, file_name: str, expected_value: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path(file_name))

        is_encrypted = MsgPartitionerOptions(**opts_args).is_encrypted

        assert is_encrypted is expected_value

    # -- .metadata_file_path ---------------------

    def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        opts_args.update(file_path="x/y/z.msg", metadata_file_path="a/b/c.msg")

        metadata_file_path = MsgPartitionerOptions(**opts_args).metadata_file_path

        assert metadata_file_path == "a/b/c.msg"

    def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        msg_path = example_doc_path("fake-email.msg")
        opts_args.update(file_path=msg_path)

        metadata_file_path = MsgPartitionerOptions(**opts_args).metadata_file_path

        assert metadata_file_path == msg_path

    def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]):
        # -- the defaults fixture leaves both `file_path` and `metadata_file_path` as None --
        metadata_file_path = MsgPartitionerOptions(**opts_args).metadata_file_path

        assert metadata_file_path is None

    # -- .metadata_last_modified -----------------

    def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]):
        caller_last_modified = "2024-03-05T17:02:53"
        opts_args.update(
            metadata_last_modified=caller_last_modified,
            file_path=example_doc_path("fake-email.msg"),
        )

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == caller_last_modified

    def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == "2023-03-28T17:00:31+00:00"

    @pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None])
    def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
        self,
        opts_args: dict[str, Any],
        filesystem_last_modified: str | None,
        Message_sent_date_: Mock,
        _last_modified_prop_: Mock,
    ):
        # -- the message reports no sent-date; `_last_modified` reports the parametrized value --
        Message_sent_date_.return_value = None
        _last_modified_prop_.return_value = filesystem_last_modified
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == filesystem_last_modified

    # -- .msg ------------------------------------

    def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        msg = MsgPartitionerOptions(**opts_args).msg

        assert isinstance(msg, Message)

    def and_it_loads_the_msg_document_from_a_file_like_object_when_provided(
        self, opts_args: dict[str, Any]
    ):
        # -- copy the file contents into an in-memory stream for the `file` argument --
        with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
            opts_args["file"] = io.BytesIO(msg_file.read())

        msg = MsgPartitionerOptions(**opts_args).msg

        assert isinstance(msg, Message)

    def but_it_raises_when_neither_is_provided(self, opts_args: dict[str, Any]):
        with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"):
            MsgPartitionerOptions(**opts_args).msg

    # -- .partition_attachments ------------------

    @pytest.mark.parametrize("partition_attachments", [True, False])
    def it_knows_whether_attachments_should_also_be_partitioned(
        self, partition_attachments: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(
            file_path=example_doc_path("fake-email.msg"),
            partition_attachments=partition_attachments,
        )

        actual = MsgPartitionerOptions(**opts_args).partition_attachments

        assert actual is partition_attachments

    # -- .partitioning_kwargs --------------------

    def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(kwargs={"foo": 42, "bar": "baz"})

        partitioning_kwargs = MsgPartitionerOptions(**opts_args).partitioning_kwargs

        assert partitioning_kwargs == {"foo": 42, "bar": "baz"}

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture
    def _last_modified_prop_(self, request: FixtureRequest):
        """Mock replacing the `_last_modified` property of `MsgPartitionerOptions`."""
        return property_mock(request, MsgPartitionerOptions, "_last_modified")

    @pytest.fixture
    def Message_sent_date_(self, request: FixtureRequest):
        """Mock replacing the `sent_date` property of `Message`."""
        return property_mock(request, Message, "sent_date")

    @pytest.fixture
    def opts_args(self) -> dict[str, Any]:
        """Keyword arguments for `MsgPartitionerOptions`, each set to its default.

        A test overrides just the entries it cares about before constructing the options
        object, which keeps each test's setup short.
        """
        return {
            "file": None,
            "file_path": None,
            "metadata_file_path": None,
            "metadata_last_modified": None,
            "partition_attachments": False,
            "kwargs": {},
        }
|