mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

**Summary**

Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare a separate rewritten module and tests and swap them out whole.

**Additional Context**

- Uses the modern stdlib `email` module to reliably accomplish several decoding steps that the legacy code performed manually.
- Remove obsolete email-specific element-types, which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc.
- Remove support for accepting an email as `text: str` because MIME-email is inherently a binary format which can, and often does, contain multiple and contradictory character-encodings.
- Remove the `encoding` parameter as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message.
- Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this, which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`.
- Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner.
- Substantially extend the test-suite to cover multiple transport-encoding/charset combinations.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
469 lines
17 KiB
Python
469 lines
17 KiB
Python
"""Test suite for `unstructured.partition.msg` module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
from typing import Any
|
|
|
|
import pytest
|
|
from oxmsg import Message
|
|
|
|
from test_unstructured.unit_utils import (
|
|
FixtureRequest,
|
|
LogCaptureFixture,
|
|
Mock,
|
|
assert_round_trips_through_JSON,
|
|
example_doc_path,
|
|
function_mock,
|
|
property_mock,
|
|
)
|
|
from unstructured.chunking.title import chunk_by_title
|
|
from unstructured.documents.elements import (
|
|
ElementMetadata,
|
|
ListItem,
|
|
NarrativeText,
|
|
Text,
|
|
Title,
|
|
)
|
|
from unstructured.partition.common import UnsupportedFileFormatError
|
|
from unstructured.partition.msg import MsgPartitionerOptions, partition_msg
|
|
|
|
# -- elements the tests below expect from partitioning the `fake-email.msg` example document --
EXPECTED_MSG_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
|
|
|
|
|
|
def test_partition_msg_from_filename():
    """Partitioning by file-path yields the expected elements and first-element metadata."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=msg_path)

    assert elements == EXPECTED_MSG_OUTPUT
    first_metadata = elements[0].metadata
    # -- `parent_id` is taken from the actual element so only the other fields are compared --
    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=msg_path,
        last_modified="2023-03-28T17:00:31+00:00",
        page_number=None,
        url=None,
        sent_from=['"Matthew Robinson" <mrobinson@unstructured.io>'],
        sent_to=["mrobinson@unstructured.io"],
        subject="Test Email",
        filetype="application/vnd.ms-outlook",
        parent_id=first_metadata.parent_id,
        languages=["eng"],
    )
    assert first_metadata.to_dict() == expected_metadata.to_dict()
|
|
|
|
|
|
def test_partition_msg_from_filename_returns_uns_elements():
    """The first element partitioned from `fake-email.msg` is a `NarrativeText`."""
    first_element = partition_msg(filename=example_doc_path("fake-email.msg"))[0]

    assert isinstance(first_element, NarrativeText)
|
|
|
|
|
|
def test_partition_msg_from_filename_with_metadata_filename():
    """A `metadata_filename` argument sets `.metadata.filename` on every element."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=msg_path, metadata_filename="test")

    for e in elements:
        assert e.metadata.filename == "test"
|
|
|
|
|
|
def test_partition_msg_from_filename_with_text_content():
    """The first element has the expected text plus file-name and directory metadata."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"))

    first_element = elements[0]
    assert str(first_element) == "This is a test email to use for unit tests."
    metadata = first_element.metadata
    assert metadata.filename == "fake-email.msg"
    assert metadata.file_directory == example_doc_path("")
|
|
|
|
|
|
def test_partition_msg_raises_with_missing_file():
    """A path to a non-existent file raises `FileNotFoundError`."""
    missing_path = example_doc_path("doesnt-exist.msg")

    with pytest.raises(FileNotFoundError):
        partition_msg(filename=missing_path)
|
|
|
|
|
|
def test_partition_msg_from_file():
    """Partitioning an open binary file yields the expected elements with no filename."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(e.metadata.filename is None for e in elements)
|
|
|
|
|
|
def test_partition_msg_from_file_with_metadata_filename():
    """A `metadata_filename` argument names the elements when partitioning an open file."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="test")

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(e.metadata.filename == "test" for e in elements)
|
|
|
|
|
|
def test_partition_msg_uses_file_path_when_both_are_specified():
    """The file-path is used when both a path and a file-like object are given."""
    msg_path = example_doc_path("fake-email.msg")
    # -- these bytes are not a valid MSG document, so the expected output must come from the path --
    bogus_file = io.BytesIO(b"abcde")

    elements = partition_msg(msg_path, file=bogus_file)

    assert elements == EXPECTED_MSG_OUTPUT
|
|
|
|
|
|
def test_partition_msg_raises_with_neither():
    """Calling `partition_msg()` with neither a path nor a file raises `ValueError`."""
    with pytest.raises(ValueError):  # noqa: PT011
        partition_msg()
|
|
|
|
|
|
# -- attachments ---------------------------------------------------------------------------------
|
|
|
|
|
|
def test_partition_msg_can_process_attachments():
    """With `process_attachments=True`, attachment elements follow the message-body elements.

    Each element's `.metadata.filename` identifies the document it came from: the message itself
    for the body elements and the attachment file-name for the attachment elements.
    """
    elements = partition_msg(
        example_doc_path("fake-email-multiple-attachments.msg"), process_attachments=True
    )

    # -- each span of elements is attributed to the document it was partitioned from --
    expected_filename_by_span = (
        (slice(None, 5), "fake-email-multiple-attachments.msg"),
        (slice(5, 7), "unstructured_logo.png"),
        (slice(7, 343), "dense_doc.pdf"),
        (slice(343, None), "Engineering Onboarding.pptx"),
    )
    for span, expected_filename in expected_filename_by_span:
        assert all(e.metadata.filename == expected_filename for e in elements[span])

    body_texts = [e.text for e in elements[:5]]
    assert body_texts == [
        "Here are those documents.",
        "--",
        "Mallori Harrell",
        "Unstructured Technologies",
        "Data Scientist",
    ]

    element_type_names = [type(e).__name__ for e in elements]
    assert element_type_names[:10] == [
        "NarrativeText",
        "Text",
        "Title",
        "Title",
        "Title",
        "Image",
        "Title",
        "Text",
        "Title",
        "Title",
    ]
    # -- the final ten elements are a title followed by nine list-items --
    assert element_type_names[-10:] == ["Title", *["ListItem"] * 9]
|
|
|
|
|
|
def test_partition_msg_silently_skips_attachments_it_cannot_partition(request: FixtureRequest):
    """Only body elements are produced when `auto.partition` raises `UnsupportedFileFormatError`."""
    # -- make every call to the mocked `auto.partition` raise as for an unsupported file-type --
    function_mock(
        request, "unstructured.partition.auto.partition", side_effect=UnsupportedFileFormatError()
    )
    msg_path = example_doc_path("fake-email-multiple-attachments.msg")

    # -- no exception is raised --
    elements = partition_msg(msg_path, process_attachments=True)

    # -- the email body is partitioned but no elements appear for the attachment(s) --
    expected_body_elements = [
        NarrativeText("Here are those documents."),
        Text("--"),
        Title("Mallori Harrell"),
        Title("Unstructured Technologies"),
        Title("Data Scientist"),
    ]
    assert elements == expected_body_elements
|
|
|
|
|
|
# -- .metadata.filename --------------------------------------------------------------------------
|
|
|
|
|
|
def test_partition_msg_from_filename_gets_filename_metadata_from_file_path():
    """File-name and directory metadata are taken from the path the message was loaded from."""
    expected_directory = example_doc_path("")

    elements = partition_msg(example_doc_path("fake-email.msg"))

    filenames = [e.metadata.filename for e in elements]
    directories = [e.metadata.file_directory for e in elements]
    assert all(name == "fake-email.msg" for name in filenames)
    assert all(directory == expected_directory for directory in directories)
|
|
|
|
|
|
def test_partition_msg_from_file_gets_filename_metadata_None():
    """File-name and directory metadata are both `None` when partitioning an open file."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    metadatas = [e.metadata for e in elements]
    assert all(m.filename is None for m in metadatas)
    assert all(m.file_directory is None for m in metadatas)
|
|
|
|
|
|
def test_partition_msg_from_filename_prefers_metadata_filename():
    """`metadata_filename` takes precedence over the source path for file-name metadata."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(msg_path, metadata_filename="a/b/c.msg")

    metadatas = [e.metadata for e in elements]
    assert all(m.filename == "c.msg" for m in metadatas)
    assert all(m.file_directory == "a/b" for m in metadatas)
|
|
|
|
|
|
def test_partition_msg_from_file_prefers_metadata_filename():
    """`metadata_filename` supplies file-name and directory metadata for an open file."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="d/e/f.msg")

    metadatas = [e.metadata for e in elements]
    assert all(m.filename == "f.msg" for m in metadatas)
    assert all(m.file_directory == "d/e" for m in metadatas)
|
|
|
|
|
|
# -- .metadata.filetype --------------------------------------------------------------------------
|
|
|
|
|
|
def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype():
    """Every element carries the MSG MIME-type in `.metadata.filetype`."""
    MSG_MIME_TYPE = "application/vnd.ms-outlook"

    elements = partition_msg(example_doc_path("fake-email.msg"))

    # -- collect the offending values so a failure reports what was actually wrong; the first
    # -- element's filetype may be correct even when a later element's is not --
    unexpected_filetypes = [
        e.metadata.filetype for e in elements if e.metadata.filetype != MSG_MIME_TYPE
    ]
    assert not unexpected_filetypes, (
        f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:"
        f" {unexpected_filetypes!r}"
    )
|
|
|
|
|
|
# -- .metadata.last_modified ---------------------------------------------------------------------
|
|
|
|
|
|
def test_partition_msg_pulls_last_modified_from_message_sent_date():
    """Every element gets the expected `last_modified` when the caller does not provide one."""
    expected_last_modified = "2023-03-28T17:00:31+00:00"

    elements = partition_msg(example_doc_path("fake-email.msg"))

    for e in elements:
        assert e.metadata.last_modified == expected_last_modified
|
|
|
|
|
|
def test_partition_msg_from_file_path_prefers_metadata_last_modified():
    """A caller-provided `metadata_last_modified` is applied to every element."""
    metadata_last_modified = "2020-07-05T09:24:28"

    elements = partition_msg(
        example_doc_path("fake-email.msg"), metadata_last_modified=metadata_last_modified
    )

    # -- check all elements, not just the first, consistent with the file-based sibling test --
    assert all(e.metadata.last_modified == metadata_last_modified for e in elements)
|
|
|
|
|
|
def test_partition_msg_from_file_prefers_metadata_last_modified():
    """A caller-provided `metadata_last_modified` is applied when partitioning an open file."""
    caller_last_modified = "2020-07-05T09:24:28"

    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_last_modified=caller_last_modified)

    for e in elements:
        assert e.metadata.last_modified == caller_last_modified
|
|
|
|
|
|
# ------------------------------------------------------------------------------------------------
|
|
|
|
|
|
def test_partition_msg_with_json():
    """Elements partitioned from `fake-email.msg` pass the JSON round-trip assertion helper."""
    msg_path = example_doc_path("fake-email.msg")

    assert_round_trips_through_JSON(partition_msg(msg_path))
|
|
|
|
|
|
def test_partition_msg_with_pgp_encrypted_message(caplog: LogCaptureFixture):
    """The `fake-encrypted.msg` fixture produces no elements and a warning is logged."""
    encrypted_msg_path = example_doc_path("fake-encrypted.msg")

    elements = partition_msg(encrypted_msg_path)

    assert elements == []
    log_text = caplog.text
    assert "WARNING" in log_text
    assert "Encrypted email detected" in log_text
|
|
|
|
|
|
def test_add_chunking_strategy_by_title_on_partition_msg():
    """`chunking_strategy="by_title"` gives the same result as chunking the elements afterward."""
    msg_path = example_doc_path("fake-email.msg")

    unchunked_elements = partition_msg(filename=msg_path)
    expected_chunks = chunk_by_title(unchunked_elements)
    chunked_elements = partition_msg(msg_path, chunking_strategy="by_title")

    # -- chunking must actually change the output, and change it the way `chunk_by_title()` does --
    assert chunked_elements != unchunked_elements
    assert chunked_elements == expected_chunks
|
|
|
|
|
|
# -- language behaviors --------------------------------------------------------------------------
|
|
|
|
|
|
def test_partition_msg_element_metadata_has_languages():
    """The first element of the English fixture reports `["eng"]` in `.metadata.languages`."""
    # -- resolve via `example_doc_path()` like the rest of this module so the test does not depend
    # -- on the working directory pytest happens to be run from --
    filename = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=filename)

    assert elements[0].metadata.languages == ["eng"]
|
|
|
|
|
|
def test_partition_msg_respects_languages_arg():
    """A caller-provided `languages` list is recorded on every element."""
    # -- resolve via `example_doc_path()` like the rest of this module so the test does not depend
    # -- on the working directory pytest happens to be run from --
    filename = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=filename, languages=["deu"])

    assert all(element.metadata.languages == ["deu"] for element in elements)
|
|
|
|
|
|
def test_partition_msg_raises_TypeError_for_invalid_languages():
    """Passing `languages` as a bare `str` rather than a list raises `TypeError`."""
    # -- resolve via `example_doc_path()` like the rest of this module so the test does not depend
    # -- on the working directory pytest happens to be run from --
    filename = example_doc_path("fake-email.msg")

    # -- only the call under test belongs inside the `raises` block --
    with pytest.raises(TypeError):
        partition_msg(filename=filename, languages="eng")
|
|
|
|
|
|
# ================================================================================================
|
|
# ISOLATED UNIT TESTS
|
|
# ================================================================================================
|
|
# These test components used by `partition_msg()` in isolation such that all edge cases can be
|
|
# exercised.
|
|
# ================================================================================================
|
|
|
|
|
|
class DescribeMsgPartitionerOptions:
    """Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects."""

    # -- .extra_msg_metadata ---------------------

    def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email-with-cc-and-bcc.msg"))

        metadata = MsgPartitionerOptions(**opts_args).extra_msg_metadata

        assert metadata.bcc_recipient == ["hello@unstructured.io"]
        assert metadata.cc_recipient == ["steve@unstructured.io"]
        assert metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
        assert metadata.sent_from == ['"John" <johnjennings702@gmail.com>']
        assert metadata.sent_to == [
            "john-ctr@unstructured.io",
            "steve@unstructured.io",
            "hello@unstructured.io",
        ]
        assert metadata.subject == "Fake email with cc and bcc recipients"

    # -- .is_encrypted ---------------------------

    @pytest.mark.parametrize(
        ("file_name", "expected_value"), [("fake-encrypted.msg", True), ("fake-email.msg", False)]
    )
    def it_knows_when_the_msg_is_encrypted(
        self, file_name: str, expected_value: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path(file_name))

        is_encrypted = MsgPartitionerOptions(**opts_args).is_encrypted

        assert is_encrypted is expected_value

    # -- .metadata_file_path ---------------------

    def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        opts_args.update(file_path="x/y/z.msg", metadata_file_path="a/b/c.msg")

        metadata_file_path = MsgPartitionerOptions(**opts_args).metadata_file_path

        assert metadata_file_path == "a/b/c.msg"

    def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        msg_path = example_doc_path("fake-email.msg")
        opts_args.update(file_path=msg_path)

        metadata_file_path = MsgPartitionerOptions(**opts_args).metadata_file_path

        assert metadata_file_path == msg_path

    def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]):
        # -- the default args specify neither `file_path` nor `metadata_file_path` --
        assert MsgPartitionerOptions(**opts_args).metadata_file_path is None

    # -- .metadata_last_modified -----------------

    def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]):
        caller_last_modified = "2024-03-05T17:02:53"
        opts_args.update(
            metadata_last_modified=caller_last_modified,
            file_path=example_doc_path("fake-email.msg"),
        )

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == caller_last_modified

    def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == "2023-03-28T17:00:31+00:00"

    @pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None])
    def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
        self,
        opts_args: dict[str, Any],
        filesystem_last_modified: str | None,
        Message_sent_date_: Mock,
        _last_modified_prop_: Mock,
    ):
        # -- the message reports no sent-date and the mocked `_last_modified` supplies the value --
        Message_sent_date_.return_value = None
        _last_modified_prop_.return_value = filesystem_last_modified
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        last_modified = MsgPartitionerOptions(**opts_args).metadata_last_modified

        assert last_modified == filesystem_last_modified

    # -- .msg ------------------------------------

    def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        msg = MsgPartitionerOptions(**opts_args).msg

        assert isinstance(msg, Message)

    def and_it_loads_the_msg_document_from_a_file_like_object_when_provided(
        self, opts_args: dict[str, Any]
    ):
        with open(example_doc_path("fake-email.msg"), "rb") as f:
            msg_bytes = f.read()
        opts_args.update(file=io.BytesIO(msg_bytes))

        msg = MsgPartitionerOptions(**opts_args).msg

        assert isinstance(msg, Message)

    def but_it_raises_when_neither_is_provided(self, opts_args: dict[str, Any]):
        with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"):
            MsgPartitionerOptions(**opts_args).msg

    # -- .partition_attachments ------------------

    @pytest.mark.parametrize("partition_attachments", [True, False])
    def it_knows_whether_attachments_should_also_be_partitioned(
        self, partition_attachments: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(
            file_path=example_doc_path("fake-email.msg"),
            partition_attachments=partition_attachments,
        )

        value = MsgPartitionerOptions(**opts_args).partition_attachments

        assert value is partition_attachments

    # -- .partitioning_kwargs --------------------

    def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(kwargs={"foo": 42, "bar": "baz"})

        partitioning_kwargs = MsgPartitionerOptions(**opts_args).partitioning_kwargs

        assert partitioning_kwargs == {"foo": 42, "bar": "baz"}

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture
    def _last_modified_prop_(self, request: FixtureRequest):
        """Mock created by `property_mock()` for `MsgPartitionerOptions._last_modified`."""
        return property_mock(request, MsgPartitionerOptions, "_last_modified")

    @pytest.fixture
    def Message_sent_date_(self, request: FixtureRequest):
        """Mock created by `property_mock()` for `Message.sent_date`."""
        return property_mock(request, Message, "sent_date")

    @pytest.fixture
    def opts_args(self) -> dict[str, Any]:
        """Default value for every `MsgPartitionerOptions` argument.

        A test overrides only the entries it cares about, which keeps construction of the options
        object compact.
        """
        return dict(
            file=None,
            file_path=None,
            metadata_file_path=None,
            metadata_last_modified=None,
            partition_attachments=False,
            kwargs={},
        )
|