diff --git a/CHANGELOG.md b/CHANGELOG.md index ab2185cf1..3e08468f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.14-dev10 +## 0.15.14-dev11 ### Enhancements @@ -16,6 +16,7 @@ * **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners. * **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners. * **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`. +* **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG. 
## 0.15.13 diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index 183efb8c6..ab30b007e 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -19,8 +19,8 @@ from unstructured.staging.base import elements_to_json "fake-email.txt", { ("NarrativeText", None): 1, - ("Title", None): 1, - ("ListItem", None): 2, + ("Title", 0): 1, + ("ListItem", 1): 2, }, ), ( @@ -49,8 +49,8 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in ( "fake-email.txt", { - ("Title", None): 1, - ("ListItem", None): 2, + ("Title", 0): 1, + ("ListItem", 1): 2, ("NarrativeText", None): 2, }, (0.8, 0.8, 0.80), diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 4ccb84ef1..2512e5c1f 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -471,6 +471,18 @@ def test_partition_email_from_filename_has_metadata(): assert element.metadata.filename == "fake-email.eml" +# -- .metadata.filetype -------------------------------------------------------------------------- + + +def test_partition_email_gets_the_EMAIL_mime_type_in_metadata_filetype(): + EMAIL_MIME_TYPE = "message/rfc822" + elements = partition_email(example_doc_path("fake-email.eml")) + assert all(e.metadata.filetype == EMAIL_MIME_TYPE for e in elements), ( + f"Expected all elements to have '{EMAIL_MIME_TYPE}' as their filetype, but got:" + f" {repr(elements[0].metadata.filetype)}" + ) + + # -- .metadata.last_modified --------------------------------------------------------------------- diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index 4b19c761b..02fc11b40 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -55,7 +55,6 @@ def test_partition_msg_from_filename(): languages=["eng"], ).to_dict() ) - 
assert all(e.metadata.filename == "fake-email.msg" for e in elements) def test_partition_msg_from_filename_returns_uns_elements(): @@ -156,6 +155,51 @@ def test_partition_msg_can_process_attachments(): ] +# -- .metadata.filename -------------------------------------------------------------------------- + + +def test_partition_msg_from_filename_gets_filename_metadata_from_file_path(): + elements = partition_msg(example_doc_path("fake-email.msg")) + + assert all(e.metadata.filename == "fake-email.msg" for e in elements) + assert all(e.metadata.file_directory == example_doc_path("") for e in elements) + + +def test_partition_msg_from_file_gets_filename_metadata_None(): + with open(example_doc_path("fake-email.msg"), "rb") as f: + elements = partition_msg(file=f) + + assert all(e.metadata.filename is None for e in elements) + assert all(e.metadata.file_directory is None for e in elements) + + +def test_partition_msg_from_filename_prefers_metadata_filename(): + elements = partition_msg(example_doc_path("fake-email.msg"), metadata_filename="a/b/c.msg") + + assert all(e.metadata.filename == "c.msg" for e in elements) + assert all(e.metadata.file_directory == "a/b" for e in elements) + + +def test_partition_msg_from_file_prefers_metadata_filename(): + with open(example_doc_path("fake-email.msg"), "rb") as f: + elements = partition_msg(file=f, metadata_filename="d/e/f.msg") + + assert all(e.metadata.filename == "f.msg" for e in elements) + assert all(e.metadata.file_directory == "d/e" for e in elements) + + +# -- .metadata.filetype -------------------------------------------------------------------------- + + +def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype(): + MSG_MIME_TYPE = "application/vnd.ms-outlook" + elements = partition_msg(example_doc_path("fake-email.msg")) + assert all(e.metadata.filetype == MSG_MIME_TYPE for e in elements), ( + f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:" + f" 
{repr(elements[0].metadata.filetype)}" + ) + + # -- .metadata.last_modified --------------------------------------------------------------------- @@ -242,6 +286,24 @@ def test_partition_msg_raises_TypeError_for_invalid_languages(): class DescribeMsgPartitionerOptions: """Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects.""" + # -- .extra_msg_metadata --------------------- + + def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg") + opts = MsgPartitionerOptions(**opts_args) + + m = opts.extra_msg_metadata + assert m.bcc_recipient == ["hello@unstructured.io"] + assert m.cc_recipient == ["steve@unstructured.io"] + assert m.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com" + assert m.sent_from == ['"John" '] + assert m.sent_to == [ + "john-ctr@unstructured.io", + "steve@unstructured.io", + "hello@unstructured.io", + ] + assert m.subject == "Fake email with cc and bcc recipients" + # -- .is_encrypted --------------------------- @pytest.mark.parametrize( @@ -257,34 +319,58 @@ class DescribeMsgPartitionerOptions: # -- .metadata_file_path --------------------- - def it_uses_the_user_provided_metadata_file_path_when_provided(self, opts_args: dict[str, Any]): + def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]): opts_args["file_path"] = "x/y/z.msg" opts_args["metadata_file_path"] = "a/b/c.msg" opts = MsgPartitionerOptions(**opts_args) assert opts.metadata_file_path == "a/b/c.msg" - @pytest.mark.parametrize("file_path", ["u/v/w.msg", None]) - def and_it_falls_back_to_the_document_file_path_otherwise_including_when_the_file_path_is_None( - self, file_path: str | None, opts_args: dict[str, Any] - ): + def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]): + file_path = example_doc_path("fake-email.msg") opts_args["file_path"] = file_path - 
opts_args["metadata_file_path"] = None opts = MsgPartitionerOptions(**opts_args) assert opts.metadata_file_path == file_path + def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]): + opts = MsgPartitionerOptions(**opts_args) + + assert opts.metadata_file_path is None + # -- .metadata_last_modified ----------------- - @pytest.mark.parametrize("metadata_last_modified", ["2024-03-05T17:02:53", None]) - def it_knows_the_metadata_last_modified_date_provided_by_the_caller( - self, metadata_last_modified: str | None, opts_args: dict[str, Any] - ): + def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]): + metadata_last_modified = "2024-03-05T17:02:53" opts_args["metadata_last_modified"] = metadata_last_modified + opts_args["file_path"] = example_doc_path("fake-email.msg") opts = MsgPartitionerOptions(**opts_args) assert opts.metadata_last_modified == metadata_last_modified + def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.metadata_last_modified == "2023-03-28T17:00:31+00:00" + + @pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None]) + def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date( + self, + opts_args: dict[str, Any], + filesystem_last_modified: str | None, + Message_sent_date_: Mock, + _last_modified_prop_: Mock, + ): + Message_sent_date_.return_value = None + _last_modified_prop_.return_value = filesystem_last_modified + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.metadata_last_modified == filesystem_last_modified + # -- .msg ------------------------------------ def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, 
Any]): @@ -306,88 +392,6 @@ class DescribeMsgPartitionerOptions: with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"): MsgPartitionerOptions(**opts_args).msg - # -- .msg_metadata --------------------------- - - def it_provides_a_unique_metadata_instance_for_each_element(self, opts_args: dict[str, Any]): - opts_args["file_path"] = example_doc_path("fake-email.msg") - opts = MsgPartitionerOptions(**opts_args) - - assert opts.msg_metadata is not opts.msg_metadata - - # -- .metadata.filename ---------------------- - - def it_uses_the_metadata_file_path_value_for_msg_metadata( - self, opts_args: dict[str, Any], metadata_file_path_prop_: Mock - ): - metadata_file_path_prop_.return_value = "a/b/c.msg" - opts_args["file_path"] = example_doc_path("fake-email.msg") - opts = MsgPartitionerOptions(**opts_args) - - assert opts.msg_metadata.filename == "c.msg" - assert opts.msg_metadata.file_directory == "a/b" - - # -- .metadata.last_modified ----------------- - - def it_uses_metadata_last_modified_when_provided_by_caller(self, opts_args: dict[str, Any]): - opts_args["file_path"] = example_doc_path("fake-email.msg") - opts_args["metadata_last_modified"] = "2024-06-03T20:07:31+00:00" - opts = MsgPartitionerOptions(**opts_args) - - assert opts.msg_metadata.last_modified == "2024-06-03T20:07:31+00:00" - - def and_it_uses_the_sent_date_of_the_email_when_metadata_last_modified_is_not_provided( - self, opts_args: dict[str, Any] - ): - opts_args["file_path"] = example_doc_path("fake-email.msg") - opts = MsgPartitionerOptions(**opts_args) - - assert opts.msg_metadata.last_modified == "2023-03-28T17:00:31+00:00" - - @pytest.mark.parametrize("file_last_modified", ["2024-06-03T20:12:53", None]) - def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date( - self, - opts_args: dict[str, Any], - file_last_modified: str | None, - Message_sent_date_: Mock, - _last_modified_prop_: Mock, - ): - 
Message_sent_date_.return_value = None - _last_modified_prop_.return_value = file_last_modified - opts_args["file_path"] = example_doc_path("fake-email.msg") - opts = MsgPartitionerOptions(**opts_args) - - assert opts.msg_metadata.last_modified == file_last_modified - - # -- .metadata (email-specific) -------------- - - def it_adds_email_specific_fields_to_the_msg_element_metadata(self, opts_args: dict[str, Any]): - opts_args["file_path"] = example_doc_path("fake-email.msg") - opts = MsgPartitionerOptions(**opts_args) - - assert opts.msg_metadata.sent_from == ['"Matthew Robinson" '] - assert opts.msg_metadata.sent_to == ["mrobinson@unstructured.io"] - assert opts.msg_metadata.subject == "Test Email" - - def it_captures_cc_and_bcc_element_metadata(self, opts_args: dict[str, Any]): - opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg") - opts = MsgPartitionerOptions(**opts_args) - - assert opts.msg_metadata.cc_recipient == ["steve@unstructured.io"] - assert opts.msg_metadata.bcc_recipient == ["hello@unstructured.io"] - assert opts.msg_metadata.sent_to == [ - "john-ctr@unstructured.io", - "steve@unstructured.io", - "hello@unstructured.io", - ] - - def it_captures_email_message_id_element_metadata(self, opts_args: dict[str, Any]): - opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg") - opts = MsgPartitionerOptions(**opts_args) - - assert ( - opts.msg_metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com" - ) - # -- .partition_attachments ------------------ @pytest.mark.parametrize("partition_attachments", [True, False]) @@ -400,6 +404,16 @@ class DescribeMsgPartitionerOptions: assert opts.partition_attachments is partition_attachments + # -- .partitioning_kwargs -------------------- + + def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function( + self, opts_args: dict[str, Any] + ): + opts_args["kwargs"] = {"foo": 42, "bar": "baz"} + opts = MsgPartitionerOptions(**opts_args) 
+ + assert opts.partitioning_kwargs == {"foo": 42, "bar": "baz"} + # -- fixtures -------------------------------------------------------------------------------- @pytest.fixture @@ -410,10 +424,6 @@ class DescribeMsgPartitionerOptions: def Message_sent_date_(self, request: FixtureRequest): return property_mock(request, Message, "sent_date") - @pytest.fixture - def metadata_file_path_prop_(self, request: FixtureRequest): - return property_mock(request, MsgPartitionerOptions, "metadata_file_path") - @pytest.fixture def opts_args(self) -> dict[str, Any]: """All default arguments for `MsgPartitionerOptions`. @@ -427,4 +437,5 @@ class DescribeMsgPartitionerOptions: "metadata_file_path": None, "metadata_last_modified": None, "partition_attachments": False, + "kwargs": {}, } diff --git a/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json index 62f8a3ebe..4304e214b 100644 --- a/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json +++ b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json @@ -1,6 +1,6 @@ [ { - "element_id": "df08d0aeb11a34e75766d2d2008d73a6", + "element_id": "e482ff3e97d6318a4c0e00aea0adf544", "metadata": { "data_source": { "date_created": "2023-07-15T15:36:08", diff --git a/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json index 56d73c1da..bde3500e7 100644 --- a/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json +++ b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json @@ -1,6 +1,6 @@ [ { - "element_id": "e40af23706b4096145f1e4b007719aa5", + "element_id": "4a69e8fcddd4b6eff8488a34ba16b0dd", "metadata": { "data_source": { "date_created": "2023-07-25T01:26:22", diff --git 
a/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json b/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json index 7596c5df2..0bbcb8fd8 100644 --- a/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json +++ b/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json @@ -1,6 +1,6 @@ [ { - "element_id": "8488a63070421b09a14ad6078c2cec2a", + "element_id": "4df3eedf1b6f98566fc40a132b48205f", "metadata": { "data_source": { "date_created": "2023-07-10T03:39:04", diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json index 5c1f06ae6..afbddc098 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json @@ -1,9 +1,13 @@ [ { "type": "NarrativeText", - "element_id": "4196fe41da19e8657761ecffcafd3d2f", + "element_id": "191e99ff4061730e85d9300183b4ccbe", "text": "Jane. This is a test of sending you an email from Salesforce! 
_____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/", "metadata": { + "languages": [ + "eng" + ], + "filetype": "message/rfc822", "email_message_id": "KhIK4000000000000000000000000000000000000000000000RZP1T400CmuP1P5wTm2m679gi-mnIg@sfdc.net", "sent_from": [ "devops+salesforce-connector@unstructured.io" @@ -12,10 +16,6 @@ "jane_gray@uoa.edu" ], "subject": "Test of email 1", - "languages": [ - "eng" - ], - "filetype": "message/rfc822", "data_source": { "url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU", "version": "1694691603.0", diff --git a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json index cc1ab18fd..9fa968280 100644 --- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json +++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json @@ -1,9 +1,13 @@ [ { "type": "NarrativeText", - "element_id": "6f168cd430b41fc0d66a3691ef3caa0f", + "element_id": "f7d72e773a4c72747c88d8ea6e5d012a", "text": "Hey Sean. Testing email parsing here. 
Type: email Just testing the email system _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/", "metadata": { + "languages": [ + "eng" + ], + "filetype": "message/rfc822", "email_message_id": "CuWky000000000000000000000000000000000000000000000RZP1VO00MaLK8OmEQm2Bw-c3ek6uNg@sfdc.net", "sent_from": [ "devops+salesforce-connector@unstructured.io" @@ -12,10 +16,6 @@ "sean@edge.com" ], "subject": "Test of Salesforce 2", - "languages": [ - "eng" - ], - "filetype": "message/rfc822", "data_source": { "url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU", "version": "1694691603.0", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 5f5caa499..f09adbe12 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.14-dev10" # pragma: no cover +__version__ = "0.15.14-dev11" # pragma: no cover diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 0a7b105db..787387c5e 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import datetime import email import os @@ -12,7 +11,6 @@ from functools import partial from tempfile import TemporaryDirectory from typing import IO, Any, Callable, Final, Type, cast -from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings from unstructured.cleaners.extract import ( extract_datetimetz, @@ -27,7 +25,6 @@ from unstructured.documents.elements import ( NarrativeText, Text, Title, - process_metadata, ) from unstructured.documents.email_elements import ( MetaData, @@ -42,12 +39,10 @@ from unstructured.file_utils.encoding import ( read_txt_file, validate_encoding, ) -from unstructured.file_utils.filetype import add_metadata_with_filetype from unstructured.file_utils.model import FileType from 
unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE from unstructured.partition.common.common import convert_to_bytes, exactly_one -from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.html import partition_html from unstructured.partition.text import partition_text @@ -56,38 +51,36 @@ VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"] DETECTION_ORIGIN: str = "email" -@process_metadata() -@add_metadata_with_filetype(FileType.EML) -@add_chunking_strategy def partition_email( filename: str | None = None, + *, file: IO[bytes] | None = None, + encoding: str | None = None, text: str | None = None, content_source: str = "text/html", - encoding: str | None = None, include_headers: bool = False, metadata_filename: str | None = None, metadata_last_modified: str | None = None, process_attachments: bool = False, attachment_partitioner: Callable[..., list[Element]] | None = None, - languages: list[str] | None = ["auto"], - detect_language_per_element: bool = False, **kwargs: Any, ) -> list[Element]: - """Partitions an .eml documents into its constituent elements. + """Partitions an .eml document into its constituent elements. + Parameters ---------- filename A string defining the target filename path. file A file-like object using "r" mode --> open(filename, "r"). + encoding + The encoding method used to decode the input bytes when drawn from `filename` or `file`. + Defaults to "utf-8". text The string representation of the .eml document. content_source default: "text/html" other: "text/plain" - encoding - The encoding method used to decode the text input. If None, utf-8 will be used. metadata_filename The filename to use for the metadata. metadata_last_modified @@ -97,13 +90,6 @@ def partition_email( processing the content of the email itself. attachment_partitioner The partitioning function to use to process attachments. 
- languages - User defined value for `metadata.languages` if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. """ if content_source not in VALID_CONTENT_SOURCES: raise ValueError( @@ -211,8 +197,9 @@ def partition_email( elements = partition_html( text=content, metadata_filename=metadata_filename, - languages=[""], + metadata_file_type=FileType.EML, detection_origin="email", + **kwargs, ) for element in elements: if isinstance(element, Text): @@ -244,8 +231,9 @@ def partition_email( elements = partition_text( text=content, encoding=encoding, - languages=[""], + metadata_file_type=FileType.EML, detection_origin="email", + **kwargs, ) else: raise ValueError( @@ -274,7 +262,7 @@ def partition_email( last_modification_date=last_modified, ) for element in all_elements: - element.metadata = copy.deepcopy(metadata) + element.metadata.update(metadata) if process_attachments: with TemporaryDirectory() as tmpdir: @@ -295,15 +283,7 @@ def partition_email( element.metadata.attached_to_filename = metadata_filename or filename all_elements.append(element) - elements = list( - apply_lang_metadata( - elements=all_elements, - languages=languages, - detect_language_per_element=detect_language_per_element, - ), - ) - - return elements + return all_elements # ================================================================================================ diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 0641c662b..5ef4c9e09 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import os import re import tempfile @@ -9,21 +8,15 @@ from typing import IO, Any, Iterator, Optional from oxmsg import Message from oxmsg.attachment import Attachment -from 
unstructured.chunking import add_chunking_strategy -from unstructured.documents.elements import Element, ElementMetadata, process_metadata -from unstructured.file_utils.filetype import add_metadata_with_filetype +from unstructured.documents.elements import Element, ElementMetadata from unstructured.file_utils.model import FileType from unstructured.logger import logger -from unstructured.partition.common.lang import apply_lang_metadata from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.html import partition_html from unstructured.partition.text import partition_text from unstructured.utils import is_temp_file_path, lazyproperty -@process_metadata() -@add_metadata_with_filetype(FileType.MSG) -@add_chunking_strategy def partition_msg( filename: Optional[str] = None, *, @@ -55,15 +48,10 @@ def partition_msg( metadata_file_path=metadata_filename, metadata_last_modified=metadata_last_modified, partition_attachments=process_attachments, + kwargs=kwargs, ) - return list( - apply_lang_metadata( - elements=_MsgPartitioner.iter_message_elements(opts), - languages=kwargs.get("languages", ["auto"]), - detect_language_per_element=kwargs.get("detect_language_per_element", False), - ) - ) + return list(_MsgPartitioner.iter_message_elements(opts)) class MsgPartitionerOptions: @@ -77,12 +65,48 @@ class MsgPartitionerOptions: metadata_file_path: str | None, metadata_last_modified: str | None, partition_attachments: bool, + kwargs: dict[str, Any], ): self._file = file self._file_path = file_path self._metadata_file_path = metadata_file_path self._metadata_last_modified = metadata_last_modified self._partition_attachments = partition_attachments + self._kwargs = kwargs + + @lazyproperty + def extra_msg_metadata(self) -> ElementMetadata: + """ElementMetadata suitable for use on an element formed from message content. + + These are only the metadata fields specific to email messages. 
The remaining metadata + fields produced by the delegate partitioner are used as produced. + + None of these metadata fields change based on the element, so we just compute it once. + """ + msg = self.msg + + sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None + sent_to = [r.email_address for r in msg.recipients] or None + bcc_recipient = ( + [c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None + ) + cc_recipient = ( + [c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None + ) + if email_message_id := msg.message_headers.get("Message-Id"): + email_message_id = re.sub(r"^<|>$", "", email_message_id) # Strip angle brackets + + element_metadata = ElementMetadata( + bcc_recipient=bcc_recipient, + cc_recipient=cc_recipient, + email_message_id=email_message_id, + sent_from=sent_from, + sent_to=sent_to, + subject=msg.subject or None, + ) + element_metadata.detection_origin = "msg" + + return element_metadata @lazyproperty def is_encrypted(self) -> bool: @@ -108,22 +132,14 @@ class MsgPartitionerOptions: @lazyproperty def metadata_last_modified(self) -> str | None: - """Caller override for `.metadata.last_modified` to be applied to all elements.""" - return self._metadata_last_modified + """Last-modified date for all elements: caller override, else sent-date, else file date.""" + email_date = sent_date.isoformat() if (sent_date := self.msg.sent_date) else None + return self._metadata_last_modified or email_date or self._last_modified @lazyproperty def msg(self) -> Message: """The `oxmsg.Message` object loaded from file or filename.""" return Message.load(self._msg_file) - @property - def msg_metadata(self) -> ElementMetadata: - """ElementMetadata suitable for use on an element formed from message content. - - A distinct instance is returned on each reference such that downstream changes to the - metadata of one element is not also reflected in another element. 
- """ - return copy.copy(self._msg_metadata) - @lazyproperty def partition_attachments(self) -> bool: """True when message attachments should also be partitioned.""" @@ -131,22 +147,20 @@ class MsgPartitionerOptions: @lazyproperty def partitioning_kwargs(self) -> dict[str, Any]: - """Partitioning keyword-arguments to be passed along to attachment partitioner.""" - # TODO: no good reason we can't accept and pass along any file-type specific kwargs - # the caller might want to send along. - return {} + """The "extra" keyword arguments received by `partition_msg()`. + + These are passed along to delegate partitioners, whose decorators extract keyword args + such as `chunking_strategy` to control chunking and metadata behaviors. + """ + return self._kwargs @lazyproperty def _last_modified(self) -> str | None: """The best last-modified date available from source-file, None if not available.""" - if self._file_path: - return ( - None - if is_temp_file_path(self._file_path) - else get_last_modified_date(self._file_path) - ) + if not self._file_path or is_temp_file_path(self._file_path): + return None - return None + return get_last_modified_date(self._file_path) @lazyproperty def _msg_file(self) -> str | IO[bytes]: @@ -159,41 +173,6 @@ class MsgPartitionerOptions: raise ValueError("one of `file` or `filename` arguments must be provided") - @property - def _msg_metadata(self) -> ElementMetadata: - """ElementMetadata "template" for elements of this message. - - None of these metadata fields change based on the element, so compute it once here and then - just make a separate copy for each element. 
- """ - msg = self.msg - - email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None - sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None - sent_to = [r.email_address for r in msg.recipients] or None - bcc_recipient = ( - [c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None - ) - cc_recipient = ( - [c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None - ) - if email_message_id := msg.message_headers.get("Message-Id"): - email_message_id = re.sub(r"^<|>$", "", email_message_id) # Strip angle brackets - - element_metadata = ElementMetadata( - filename=self.metadata_file_path, - last_modified=self._metadata_last_modified or email_date or self._last_modified, - sent_from=sent_from, - sent_to=sent_to, - subject=msg.subject or None, - bcc_recipient=bcc_recipient, - cc_recipient=cc_recipient, - email_message_id=email_message_id, - ) - element_metadata.detection_origin = "msg" - - return element_metadata - class _MsgPartitioner: """Partitions Outlook email message (MSG) files.""" @@ -230,15 +209,28 @@ class _MsgPartitioner: msg = self._opts.msg if html_body := msg.html_body: - elements = partition_html(text=html_body, languages=[""]) + elements = partition_html( + text=html_body, + metadata_filename=self._opts.metadata_file_path, + metadata_file_type=FileType.MSG, + metadata_last_modified=self._opts.metadata_last_modified, + **self._opts.partitioning_kwargs, + ) elif msg.body: - elements = partition_text(text=msg.body, languages=[""]) + elements = partition_text( + text=msg.body, + metadata_filename=self._opts.metadata_file_path, + metadata_file_type=FileType.MSG, + metadata_last_modified=self._opts.metadata_last_modified, + **self._opts.partitioning_kwargs, + ) else: elements: list[Element] = [] - # -- replace the element metadata with email-specific values -- + # -- augment the element metadata with email-specific values -- + email_specific_metadata 
= self._opts.extra_msg_metadata for e in elements: - e.metadata = self._opts.msg_metadata + e.metadata.update(email_specific_metadata) yield e