mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-30 17:38:13 +00:00 
			
		
		
		
	 1eceac26c8
			
		
	
	
		1eceac26c8
		
			
		
	
	
	
	
		
			
			**Summary** Initial attempts to incrementally refactor `partition_email()` into shape to allow pluggable partitioning quickly became too complex for ready code-review. Prepare separate rewritten module and tests and swap them out whole. **Additional Context** - Uses the modern stdlib `email` module to reliably accomplish several manual decoding steps in the legacy code. - Remove obsolete email-specific element-types which were replaced 18 months or so ago with email-specific metadata fields for things like Cc: addresses, subject, etc. - Remove accepting an email as `text: str` because MIME-email is inherently a binary format which can and often does contain multiple and contradictory character-encodings. - Remove the `encoding` parameter as it is now unused. An email file is not a text file and as such does not have a single overall encoding. Character encoding is specified individually for each MIME-part within the message and often varies from one part to another in the same message. - Remove the need for a caller to specify `attachment_partitioner`. There is only one reasonable choice for this which is `auto.partition()`, consistent with the same interface and operation in `partition_msg()`. - Fixes #3671 along the way by silently skipping attachments with a file-type for which there is no partitioner. - Substantially extend the test-suite to cover multiple transport-encoding/charset combinations. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
		
			
				
	
	
		
			469 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			469 lines
		
	
	
		
			17 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """Test suite for `unstructured.partition.msg` module."""
 | |
| 
 | |
| from __future__ import annotations
 | |
| 
 | |
| import io
 | |
| from typing import Any
 | |
| 
 | |
| import pytest
 | |
| from oxmsg import Message
 | |
| 
 | |
| from test_unstructured.unit_utils import (
 | |
|     FixtureRequest,
 | |
|     LogCaptureFixture,
 | |
|     Mock,
 | |
|     assert_round_trips_through_JSON,
 | |
|     example_doc_path,
 | |
|     function_mock,
 | |
|     property_mock,
 | |
| )
 | |
| from unstructured.chunking.title import chunk_by_title
 | |
| from unstructured.documents.elements import (
 | |
|     ElementMetadata,
 | |
|     ListItem,
 | |
|     NarrativeText,
 | |
|     Text,
 | |
|     Title,
 | |
| )
 | |
| from unstructured.partition.common import UnsupportedFileFormatError
 | |
| from unstructured.partition.msg import MsgPartitionerOptions, partition_msg
 | |
| 
 | |
# -- elements the tests below expect from partitioning the body of `fake-email.msg` --
EXPECTED_MSG_OUTPUT = [
    NarrativeText("This is a test email to use for unit tests."),
    Title("Important points:"),
    ListItem("Roses are red"),
    ListItem("Violets are blue"),
]
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_filename():
    """Partitioning by file path gives the expected elements and first-element metadata."""
    msg_path = example_doc_path("fake-email.msg")

    elements = partition_msg(filename=msg_path)
    first_metadata = elements[0].metadata

    assert elements == EXPECTED_MSG_OUTPUT

    # -- `parent_id` is copied from the actual metadata, so it is not itself under test here --
    expected_metadata = ElementMetadata(
        coordinates=None,
        filename=msg_path,
        last_modified="2023-03-28T17:00:31+00:00",
        page_number=None,
        url=None,
        sent_from=['"Matthew Robinson" <mrobinson@unstructured.io>'],
        sent_to=["mrobinson@unstructured.io"],
        subject="Test Email",
        filetype="application/vnd.ms-outlook",
        parent_id=first_metadata.parent_id,
        languages=["eng"],
    )
    assert first_metadata.to_dict() == expected_metadata.to_dict()
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_filename_returns_uns_elements():
    """The first element produced for `fake-email.msg` is a `NarrativeText` instance."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"))
    first_element = elements[0]
    assert isinstance(first_element, NarrativeText)
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_filename_with_metadata_filename():
    """A `metadata_filename` argument replaces the filename recorded on every element."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"), metadata_filename="test")

    for element in elements:
        assert element.metadata.filename == "test"
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_filename_with_text_content():
    """The first element carries the body text plus file-name and directory metadata."""
    elements = partition_msg(filename=example_doc_path("fake-email.msg"))

    first_element = elements[0]
    assert str(first_element) == "This is a test email to use for unit tests."
    assert first_element.metadata.filename == "fake-email.msg"
    assert first_element.metadata.file_directory == example_doc_path("")
 | |
| 
 | |
| 
 | |
def test_partition_msg_raises_with_missing_file():
    """A path to a file that does not exist raises `FileNotFoundError`."""
    missing_path = example_doc_path("doesnt-exist.msg")

    with pytest.raises(FileNotFoundError):
        partition_msg(filename=missing_path)
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_file():
    """Partitioning an open binary file gives the expected elements with no filename metadata."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(element.metadata.filename is None for element in elements)
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_file_with_metadata_filename():
    """`metadata_filename` supplies the element filename when partitioning from an open file."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="test")

    assert elements == EXPECTED_MSG_OUTPUT
    assert all(element.metadata.filename == "test" for element in elements)
 | |
| 
 | |
| 
 | |
def test_partition_msg_uses_file_path_when_both_are_specified():
    """When both a path and a file are given, the expected output comes from the path."""
    # -- these bytes are not a MSG file; the test only passes if the path is what gets read --
    bogus_file = io.BytesIO(b"abcde")

    elements = partition_msg(example_doc_path("fake-email.msg"), file=bogus_file)

    assert elements == EXPECTED_MSG_OUTPUT
 | |
| 
 | |
| 
 | |
def test_partition_msg_raises_with_neither():
    """Calling `partition_msg()` with neither a path nor a file raises `ValueError`."""
    with pytest.raises(ValueError):
        partition_msg()
 | |
| 
 | |
| 
 | |
| # -- attachments ---------------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_msg_can_process_attachments():
    """With `process_attachments=True`, attachment elements follow the message-body elements."""
    elements = partition_msg(
        example_doc_path("fake-email-multiple-attachments.msg"), process_attachments=True
    )

    filenames = [e.metadata.filename for e in elements]
    type_names = [type(e).__name__ for e in elements]

    # -- each slice of the element list is attributed to the document it came from --
    assert all(name == "fake-email-multiple-attachments.msg" for name in filenames[:5])
    assert all(name == "unstructured_logo.png" for name in filenames[5:7])
    assert all(name == "dense_doc.pdf" for name in filenames[7:343])
    assert all(name == "Engineering Onboarding.pptx" for name in filenames[343:])

    body_texts = [e.text for e in elements[:5]]
    assert body_texts == [
        "Here are those documents.",
        "--",
        "Mallori Harrell",
        "Unstructured Technologies",
        "Data Scientist",
    ]

    assert type_names[:10] == [
        "NarrativeText",
        "Text",
        "Title",
        "Title",
        "Title",
        "Image",
        "Title",
        "Text",
        "Title",
        "Title",
    ]
    assert type_names[-10:] == [
        "Title",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
        "ListItem",
    ]
 | |
| 
 | |
| 
 | |
def test_partition_msg_silently_skips_attachments_it_cannot_partition(request: FixtureRequest):
    """Only body elements are returned when `auto.partition()` raises for every attachment."""
    # -- make every call to `auto.partition()` raise the unsupported-format error --
    function_mock(
        request, "unstructured.partition.auto.partition", side_effect=UnsupportedFileFormatError()
    )
    # -- only the email body is partitioned; no elements appear for the attachment(s) --
    expected_body_elements = [
        NarrativeText("Here are those documents."),
        Text("--"),
        Title("Mallori Harrell"),
        Title("Unstructured Technologies"),
        Title("Data Scientist"),
    ]

    elements = partition_msg(
        example_doc_path("fake-email-multiple-attachments.msg"), process_attachments=True
    )

    # -- no exception is raised --
    assert elements == expected_body_elements
 | |
| 
 | |
| 
 | |
| # -- .metadata.filename --------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_filename_gets_filename_metadata_from_file_path():
    """File name and directory metadata are derived from the path passed to the partitioner."""
    elements = partition_msg(example_doc_path("fake-email.msg"))
    docs_dir = example_doc_path("")

    for element in elements:
        assert element.metadata.filename == "fake-email.msg"
        assert element.metadata.file_directory == docs_dir
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_file_gets_filename_metadata_None():
    """Elements partitioned from an open file have `None` for both filename and directory."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    for element in elements:
        assert element.metadata.filename is None
        assert element.metadata.file_directory is None
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_filename_prefers_metadata_filename():
    """`metadata_filename` overrides the real path and is split into directory and file name."""
    elements = partition_msg(example_doc_path("fake-email.msg"), metadata_filename="a/b/c.msg")

    for element in elements:
        assert element.metadata.filename == "c.msg"
        assert element.metadata.file_directory == "a/b"
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_file_prefers_metadata_filename():
    """`metadata_filename` is split into directory and file name for an open-file source too."""
    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_filename="d/e/f.msg")

    for element in elements:
        assert element.metadata.filename == "f.msg"
        assert element.metadata.file_directory == "d/e"
 | |
| 
 | |
| 
 | |
| # -- .metadata.filetype --------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype():
    """Every element reports the Outlook MSG MIME-type as its `filetype`."""
    MSG_MIME_TYPE = "application/vnd.ms-outlook"

    elements = partition_msg(example_doc_path("fake-email.msg"))

    every_element_matches = all(e.metadata.filetype == MSG_MIME_TYPE for e in elements)
    # -- the failure message is only built (and `elements[0]` only read) when the check fails --
    assert every_element_matches, (
        f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:"
        f" {elements[0].metadata.filetype!r}"
    )
 | |
| 
 | |
| 
 | |
| # -- .metadata.last_modified ---------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_msg_pulls_last_modified_from_message_sent_date():
    """Without an override, every element's `last_modified` is the expected timestamp."""
    expected_timestamp = "2023-03-28T17:00:31+00:00"

    elements = partition_msg(example_doc_path("fake-email.msg"))

    for element in elements:
        assert element.metadata.last_modified == expected_timestamp
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_file_path_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is reported on the first element."""
    override = "2020-07-05T09:24:28"

    elements = partition_msg(example_doc_path("fake-email.msg"), metadata_last_modified=override)

    first_element = elements[0]
    assert first_element.metadata.last_modified == override
 | |
| 
 | |
| 
 | |
def test_partition_msg_from_file_prefers_metadata_last_modified():
    """A caller-supplied `metadata_last_modified` is reported on every element of a file source."""
    override = "2020-07-05T09:24:28"

    with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
        elements = partition_msg(file=msg_file, metadata_last_modified=override)

    for element in elements:
        assert element.metadata.last_modified == override
 | |
| 
 | |
| 
 | |
| # ------------------------------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_msg_with_json():
    """Elements partitioned from `fake-email.msg` pass the JSON round-trip assertion helper."""
    msg_path = example_doc_path("fake-email.msg")
    assert_round_trips_through_JSON(partition_msg(msg_path))
 | |
| 
 | |
| 
 | |
def test_partition_msg_with_pgp_encrypted_message(caplog: LogCaptureFixture):
    """Partitioning `fake-encrypted.msg` yields no elements and logs a warning."""
    elements = partition_msg(example_doc_path("fake-encrypted.msg"))

    assert elements == []
    for expected_fragment in ("WARNING", "Encrypted email detected"):
        assert expected_fragment in caplog.text
 | |
| 
 | |
| 
 | |
def test_add_chunking_strategy_by_title_on_partition_msg():
    """`chunking_strategy="by_title"` matches calling `chunk_by_title()` on the plain elements."""
    msg_path = example_doc_path("fake-email.msg")

    unchunked = partition_msg(filename=msg_path)
    chunked_by_partitioner = partition_msg(msg_path, chunking_strategy="by_title")
    chunked_directly = chunk_by_title(unchunked)

    # -- chunking must actually change the output, and both chunking routes must agree --
    assert chunked_by_partitioner != unchunked
    assert chunked_by_partitioner == chunked_directly
 | |
| 
 | |
| 
 | |
| # -- language behaviors --------------------------------------------------------------------------
 | |
| 
 | |
| 
 | |
def test_partition_msg_element_metadata_has_languages():
    """The first element of `fake-email.msg` reports `["eng"]` as its languages."""
    # -- resolve the fixture with `example_doc_path()` like the other tests in this module, so
    # -- the test does not depend on the directory pytest is invoked from --
    filename = example_doc_path("fake-email.msg")
    elements = partition_msg(filename=filename)
    assert elements[0].metadata.languages == ["eng"]
 | |
| 
 | |
| 
 | |
def test_partition_msg_respects_languages_arg():
    """A `languages` argument is recorded on every element in place of the default."""
    # -- resolve the fixture with `example_doc_path()` like the other tests in this module, so
    # -- the test does not depend on the directory pytest is invoked from --
    filename = example_doc_path("fake-email.msg")
    elements = partition_msg(filename=filename, languages=["deu"])
    assert all(element.metadata.languages == ["deu"] for element in elements)
 | |
| 
 | |
| 
 | |
def test_partition_msg_raises_TypeError_for_invalid_languages():
    """Passing `languages` as a bare `str` rather than a list raises `TypeError`."""
    # -- resolve the fixture with `example_doc_path()` like the other tests in this module, and
    # -- do it outside the `raises` block so only the call under test can satisfy it --
    filename = example_doc_path("fake-email.msg")
    with pytest.raises(TypeError):
        partition_msg(filename=filename, languages="eng")
 | |
| 
 | |
| 
 | |
| # ================================================================================================
 | |
| # ISOLATED UNIT TESTS
 | |
| # ================================================================================================
 | |
| # These test components used by `partition_msg()` in isolation such that all edge cases can be
 | |
| # exercised.
 | |
| # ================================================================================================
 | |
| 
 | |
| 
 | |
class DescribeMsgPartitionerOptions:
    """Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects."""

    # -- .extra_msg_metadata ---------------------

    def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email-with-cc-and-bcc.msg"))

        metadata = MsgPartitionerOptions(**opts_args).extra_msg_metadata

        assert metadata.bcc_recipient == ["hello@unstructured.io"]
        assert metadata.cc_recipient == ["steve@unstructured.io"]
        assert metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
        assert metadata.sent_from == ['"John" <johnjennings702@gmail.com>']
        assert metadata.sent_to == [
            "john-ctr@unstructured.io",
            "steve@unstructured.io",
            "hello@unstructured.io",
        ]
        assert metadata.subject == "Fake email with cc and bcc recipients"

    # -- .is_encrypted ---------------------------

    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
        [
            ("fake-encrypted.msg", True),
            ("fake-email.msg", False),
        ],
    )
    def it_knows_when_the_msg_is_encrypted(
        self, file_name: str, expected_value: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path(file_name))

        options = MsgPartitionerOptions(**opts_args)

        assert options.is_encrypted is expected_value

    # -- .metadata_file_path ---------------------

    def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        opts_args.update(file_path="x/y/z.msg", metadata_file_path="a/b/c.msg")

        options = MsgPartitionerOptions(**opts_args)

        assert options.metadata_file_path == "a/b/c.msg"

    def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        msg_path = example_doc_path("fake-email.msg")
        opts_args.update(file_path=msg_path)

        options = MsgPartitionerOptions(**opts_args)

        assert options.metadata_file_path == msg_path

    def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]):
        # -- the fixture defaults leave both `file_path` and `metadata_file_path` as None --
        options = MsgPartitionerOptions(**opts_args)

        assert options.metadata_file_path is None

    # -- .metadata_last_modified -----------------

    def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]):
        caller_value = "2024-03-05T17:02:53"
        opts_args.update(
            metadata_last_modified=caller_value,
            file_path=example_doc_path("fake-email.msg"),
        )

        options = MsgPartitionerOptions(**opts_args)

        assert options.metadata_last_modified == caller_value

    def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        options = MsgPartitionerOptions(**opts_args)

        assert options.metadata_last_modified == "2023-03-28T17:00:31+00:00"

    @pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None])
    def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
        self,
        opts_args: dict[str, Any],
        filesystem_last_modified: str | None,
        Message_sent_date_: Mock,
        _last_modified_prop_: Mock,
    ):
        # -- the message reports no sent-date; the options object reports the parametrized one --
        Message_sent_date_.return_value = None
        _last_modified_prop_.return_value = filesystem_last_modified
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        options = MsgPartitionerOptions(**opts_args)

        assert options.metadata_last_modified == filesystem_last_modified

    # -- .msg ------------------------------------

    def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]):
        opts_args.update(file_path=example_doc_path("fake-email.msg"))

        options = MsgPartitionerOptions(**opts_args)

        assert isinstance(options.msg, Message)

    def and_it_loads_the_msg_document_from_a_file_like_object_when_provided(
        self, opts_args: dict[str, Any]
    ):
        with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
            msg_bytes = msg_file.read()
        # -- an in-memory stream stands in for the caller's file-like object --
        opts_args.update(file=io.BytesIO(msg_bytes))

        options = MsgPartitionerOptions(**opts_args)

        assert isinstance(options.msg, Message)

    def but_it_raises_when_neither_is_provided(self, opts_args: dict[str, Any]):
        expected_message = "one of `file` or `filename` arguments must be prov"

        with pytest.raises(ValueError, match=expected_message):
            MsgPartitionerOptions(**opts_args).msg

    # -- .partition_attachments ------------------

    @pytest.mark.parametrize("partition_attachments", [True, False])
    def it_knows_whether_attachments_should_also_be_partitioned(
        self, partition_attachments: bool, opts_args: dict[str, Any]
    ):
        opts_args.update(
            file_path=example_doc_path("fake-email.msg"),
            partition_attachments=partition_attachments,
        )

        options = MsgPartitionerOptions(**opts_args)

        assert options.partition_attachments is partition_attachments

    # -- .partitioning_kwargs --------------------

    def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function(
        self, opts_args: dict[str, Any]
    ):
        opts_args.update(kwargs={"foo": 42, "bar": "baz"})

        options = MsgPartitionerOptions(**opts_args)

        assert options.partitioning_kwargs == {"foo": 42, "bar": "baz"}

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture
    def _last_modified_prop_(self, request: FixtureRequest):
        # -- mock for the `_last_modified` property of `MsgPartitionerOptions` --
        return property_mock(request, MsgPartitionerOptions, "_last_modified")

    @pytest.fixture
    def Message_sent_date_(self, request: FixtureRequest):
        # -- mock for the `sent_date` property of `oxmsg.Message` --
        return property_mock(request, Message, "sent_date")

    @pytest.fixture
    def opts_args(self) -> dict[str, Any]:
        """Baseline keyword arguments for constructing `MsgPartitionerOptions`.

        Each test overrides only the entries it cares about before unpacking the dict into the
        constructor, which keeps construction of the options object short in every test.
        """
        return dict(
            file=None,
            file_path=None,
            metadata_file_path=None,
            metadata_last_modified=None,
            partition_attachments=False,
            kwargs={},
        )
 |