rfctr(part): remove double-decoration 5 (#3692)

**Summary**
Remove double-decoration from EML and MSG.

**Additional Context**
- These needed to wait until the end because `partition_email()` and
`partition_msg()` can use any other partitioner for any of their
attachments.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: scanny <scanny@users.noreply.github.com>
This commit is contained in:
Steve Canny 2024-10-04 14:01:32 -07:00 committed by GitHub
parent 4711a8dc26
commit 718891a447
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 219 additions and 223 deletions

View File

@ -1,4 +1,4 @@
## 0.15.14-dev10
## 0.15.14-dev11
### Enhancements
@ -16,6 +16,7 @@
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
* **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`.
* **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG.
## 0.15.13

View File

@ -19,8 +19,8 @@ from unstructured.staging.base import elements_to_json
"fake-email.txt",
{
("NarrativeText", None): 1,
("Title", None): 1,
("ListItem", None): 2,
("Title", 0): 1,
("ListItem", 1): 2,
},
),
(
@ -49,8 +49,8 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
(
"fake-email.txt",
{
("Title", None): 1,
("ListItem", None): 2,
("Title", 0): 1,
("ListItem", 1): 2,
("NarrativeText", None): 2,
},
(0.8, 0.8, 0.80),

View File

@ -471,6 +471,18 @@ def test_partition_email_from_filename_has_metadata():
assert element.metadata.filename == "fake-email.eml"
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_email_gets_the_EMAIL_mime_type_in_metadata_filetype():
EMAIL_MIME_TYPE = "message/rfc822"
elements = partition_email(example_doc_path("fake-email.eml"))
assert all(e.metadata.filetype == EMAIL_MIME_TYPE for e in elements), (
f"Expected all elements to have '{EMAIL_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.last_modified ---------------------------------------------------------------------

View File

@ -55,7 +55,6 @@ def test_partition_msg_from_filename():
languages=["eng"],
).to_dict()
)
assert all(e.metadata.filename == "fake-email.msg" for e in elements)
def test_partition_msg_from_filename_returns_uns_elements():
@ -156,6 +155,51 @@ def test_partition_msg_can_process_attachments():
]
# -- .metadata.filename --------------------------------------------------------------------------
def test_partition_msg_from_filename_gets_filename_metadata_from_file_path():
elements = partition_msg(example_doc_path("fake-email.msg"))
assert all(e.metadata.filename == "fake-email.msg" for e in elements)
assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
def test_partition_msg_from_file_gets_filename_metadata_None():
with open(example_doc_path("fake-email.msg"), "rb") as f:
elements = partition_msg(file=f)
assert all(e.metadata.filename is None for e in elements)
assert all(e.metadata.file_directory is None for e in elements)
def test_partition_msg_from_filename_prefers_metadata_filename():
elements = partition_msg(example_doc_path("fake-email.msg"), metadata_filename="a/b/c.msg")
assert all(e.metadata.filename == "c.msg" for e in elements)
assert all(e.metadata.file_directory == "a/b" for e in elements)
def test_partition_msg_from_file_prefers_metadata_filename():
with open(example_doc_path("fake-email.msg"), "rb") as f:
elements = partition_msg(file=f, metadata_filename="d/e/f.msg")
assert all(e.metadata.filename == "f.msg" for e in elements)
assert all(e.metadata.file_directory == "d/e" for e in elements)
# -- .metadata.filetype --------------------------------------------------------------------------
def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype():
MSG_MIME_TYPE = "application/vnd.ms-outlook"
elements = partition_msg(example_doc_path("fake-email.msg"))
assert all(e.metadata.filetype == MSG_MIME_TYPE for e in elements), (
f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:"
f" {repr(elements[0].metadata.filetype)}"
)
# -- .metadata.last_modified ---------------------------------------------------------------------
@ -242,6 +286,24 @@ def test_partition_msg_raises_TypeError_for_invalid_languages():
class DescribeMsgPartitionerOptions:
"""Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects."""
# -- .extra_msg_metadata ---------------------
def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
opts = MsgPartitionerOptions(**opts_args)
m = opts.extra_msg_metadata
assert m.bcc_recipient == ["hello@unstructured.io"]
assert m.cc_recipient == ["steve@unstructured.io"]
assert m.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
assert m.sent_from == ['"John" <johnjennings702@gmail.com>']
assert m.sent_to == [
"john-ctr@unstructured.io",
"steve@unstructured.io",
"hello@unstructured.io",
]
assert m.subject == "Fake email with cc and bcc recipients"
# -- .is_encrypted ---------------------------
@pytest.mark.parametrize(
@ -257,34 +319,58 @@ class DescribeMsgPartitionerOptions:
# -- .metadata_file_path ---------------------
def it_uses_the_user_provided_metadata_file_path_when_provided(self, opts_args: dict[str, Any]):
def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
opts_args["file_path"] = "x/y/z.msg"
opts_args["metadata_file_path"] = "a/b/c.msg"
opts = MsgPartitionerOptions(**opts_args)
assert opts.metadata_file_path == "a/b/c.msg"
@pytest.mark.parametrize("file_path", ["u/v/w.msg", None])
def and_it_falls_back_to_the_document_file_path_otherwise_including_when_the_file_path_is_None(
self, file_path: str | None, opts_args: dict[str, Any]
):
def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
file_path = example_doc_path("fake-email.msg")
opts_args["file_path"] = file_path
opts_args["metadata_file_path"] = None
opts = MsgPartitionerOptions(**opts_args)
assert opts.metadata_file_path == file_path
def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]):
opts = MsgPartitionerOptions(**opts_args)
assert opts.metadata_file_path is None
# -- .metadata_last_modified -----------------
@pytest.mark.parametrize("metadata_last_modified", ["2024-03-05T17:02:53", None])
def it_knows_the_metadata_last_modified_date_provided_by_the_caller(
self, metadata_last_modified: str | None, opts_args: dict[str, Any]
):
def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]):
metadata_last_modified = "2024-03-05T17:02:53"
opts_args["metadata_last_modified"] = metadata_last_modified
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.metadata_last_modified == metadata_last_modified
def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided(
self, opts_args: dict[str, Any]
):
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.metadata_last_modified == "2023-03-28T17:00:31+00:00"
@pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None])
def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
self,
opts_args: dict[str, Any],
filesystem_last_modified: str | None,
Message_sent_date_: Mock,
_last_modified_prop_: Mock,
):
Message_sent_date_.return_value = None
_last_modified_prop_.return_value = filesystem_last_modified
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.metadata_last_modified == filesystem_last_modified
# -- .msg ------------------------------------
def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]):
@ -306,88 +392,6 @@ class DescribeMsgPartitionerOptions:
with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"):
MsgPartitionerOptions(**opts_args).msg
# -- .msg_metadata ---------------------------
def it_provides_a_unique_metadata_instance_for_each_element(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.msg_metadata is not opts.msg_metadata
# -- .metadata.filename ----------------------
def it_uses_the_metadata_file_path_value_for_msg_metadata(
self, opts_args: dict[str, Any], metadata_file_path_prop_: Mock
):
metadata_file_path_prop_.return_value = "a/b/c.msg"
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.msg_metadata.filename == "c.msg"
assert opts.msg_metadata.file_directory == "a/b"
# -- .metadata.last_modified -----------------
def it_uses_metadata_last_modified_when_provided_by_caller(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts_args["metadata_last_modified"] = "2024-06-03T20:07:31+00:00"
opts = MsgPartitionerOptions(**opts_args)
assert opts.msg_metadata.last_modified == "2024-06-03T20:07:31+00:00"
def and_it_uses_the_sent_date_of_the_email_when_metadata_last_modified_is_not_provided(
self, opts_args: dict[str, Any]
):
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.msg_metadata.last_modified == "2023-03-28T17:00:31+00:00"
@pytest.mark.parametrize("file_last_modified", ["2024-06-03T20:12:53", None])
def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
self,
opts_args: dict[str, Any],
file_last_modified: str | None,
Message_sent_date_: Mock,
_last_modified_prop_: Mock,
):
Message_sent_date_.return_value = None
_last_modified_prop_.return_value = file_last_modified
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.msg_metadata.last_modified == file_last_modified
# -- .metadata (email-specific) --------------
def it_adds_email_specific_fields_to_the_msg_element_metadata(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("fake-email.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.msg_metadata.sent_from == ['"Matthew Robinson" <mrobinson@unstructured.io>']
assert opts.msg_metadata.sent_to == ["mrobinson@unstructured.io"]
assert opts.msg_metadata.subject == "Test Email"
def it_captures_cc_and_bcc_element_metadata(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
opts = MsgPartitionerOptions(**opts_args)
assert opts.msg_metadata.cc_recipient == ["steve@unstructured.io"]
assert opts.msg_metadata.bcc_recipient == ["hello@unstructured.io"]
assert opts.msg_metadata.sent_to == [
"john-ctr@unstructured.io",
"steve@unstructured.io",
"hello@unstructured.io",
]
def it_captures_email_message_id_element_metadata(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
opts = MsgPartitionerOptions(**opts_args)
assert (
opts.msg_metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
)
# -- .partition_attachments ------------------
@pytest.mark.parametrize("partition_attachments", [True, False])
@ -400,6 +404,16 @@ class DescribeMsgPartitionerOptions:
assert opts.partition_attachments is partition_attachments
# -- .partitioning_kwargs --------------------
def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function(
self, opts_args: dict[str, Any]
):
opts_args["kwargs"] = {"foo": 42, "bar": "baz"}
opts = MsgPartitionerOptions(**opts_args)
assert opts.partitioning_kwargs == {"foo": 42, "bar": "baz"}
# -- fixtures --------------------------------------------------------------------------------
@pytest.fixture
@ -410,10 +424,6 @@ class DescribeMsgPartitionerOptions:
def Message_sent_date_(self, request: FixtureRequest):
return property_mock(request, Message, "sent_date")
@pytest.fixture
def metadata_file_path_prop_(self, request: FixtureRequest):
return property_mock(request, MsgPartitionerOptions, "metadata_file_path")
@pytest.fixture
def opts_args(self) -> dict[str, Any]:
"""All default arguments for `MsgPartitionerOptions`.
@ -427,4 +437,5 @@ class DescribeMsgPartitionerOptions:
"metadata_file_path": None,
"metadata_last_modified": None,
"partition_attachments": False,
"kwargs": {},
}

View File

@ -1,6 +1,6 @@
[
{
"element_id": "df08d0aeb11a34e75766d2d2008d73a6",
"element_id": "e482ff3e97d6318a4c0e00aea0adf544",
"metadata": {
"data_source": {
"date_created": "2023-07-15T15:36:08",

View File

@ -1,6 +1,6 @@
[
{
"element_id": "e40af23706b4096145f1e4b007719aa5",
"element_id": "4a69e8fcddd4b6eff8488a34ba16b0dd",
"metadata": {
"data_source": {
"date_created": "2023-07-25T01:26:22",

View File

@ -1,6 +1,6 @@
[
{
"element_id": "8488a63070421b09a14ad6078c2cec2a",
"element_id": "4df3eedf1b6f98566fc40a132b48205f",
"metadata": {
"data_source": {
"date_created": "2023-07-10T03:39:04",

View File

@ -1,9 +1,13 @@
[
{
"type": "NarrativeText",
"element_id": "4196fe41da19e8657761ecffcafd3d2f",
"element_id": "191e99ff4061730e85d9300183b4ccbe",
"text": "Jane. This is a test of sending you an email from Salesforce! _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/",
"metadata": {
"languages": [
"eng"
],
"filetype": "message/rfc822",
"email_message_id": "KhIK4000000000000000000000000000000000000000000000RZP1T400CmuP1P5wTm2m679gi-mnIg@sfdc.net",
"sent_from": [
"devops+salesforce-connector@unstructured.io"
@ -12,10 +16,6 @@
"jane_gray@uoa.edu"
],
"subject": "Test of email 1",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
"version": "1694691603.0",

View File

@ -1,9 +1,13 @@
[
{
"type": "NarrativeText",
"element_id": "6f168cd430b41fc0d66a3691ef3caa0f",
"element_id": "f7d72e773a4c72747c88d8ea6e5d012a",
"text": "Hey Sean. Testing email parsing here. Type: email Just testing the email system _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/",
"metadata": {
"languages": [
"eng"
],
"filetype": "message/rfc822",
"email_message_id": "CuWky000000000000000000000000000000000000000000000RZP1VO00MaLK8OmEQm2Bw-c3ek6uNg@sfdc.net",
"sent_from": [
"devops+salesforce-connector@unstructured.io"
@ -12,10 +16,6 @@
"sean@edge.com"
],
"subject": "Test of Salesforce 2",
"languages": [
"eng"
],
"filetype": "message/rfc822",
"data_source": {
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
"version": "1694691603.0",

View File

@ -1 +1 @@
__version__ = "0.15.14-dev10" # pragma: no cover
__version__ = "0.15.14-dev11" # pragma: no cover

View File

@ -1,6 +1,5 @@
from __future__ import annotations
import copy
import datetime
import email
import os
@ -12,7 +11,6 @@ from functools import partial
from tempfile import TemporaryDirectory
from typing import IO, Any, Callable, Final, Type, cast
from unstructured.chunking import add_chunking_strategy
from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings
from unstructured.cleaners.extract import (
extract_datetimetz,
@ -27,7 +25,6 @@ from unstructured.documents.elements import (
NarrativeText,
Text,
Title,
process_metadata,
)
from unstructured.documents.email_elements import (
MetaData,
@ -42,12 +39,10 @@ from unstructured.file_utils.encoding import (
read_txt_file,
validate_encoding,
)
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
from unstructured.partition.common.common import convert_to_bytes, exactly_one
from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html
from unstructured.partition.text import partition_text
@ -56,38 +51,36 @@ VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"]
DETECTION_ORIGIN: str = "email"
@process_metadata()
@add_metadata_with_filetype(FileType.EML)
@add_chunking_strategy
def partition_email(
filename: str | None = None,
*,
file: IO[bytes] | None = None,
encoding: str | None = None,
text: str | None = None,
content_source: str = "text/html",
encoding: str | None = None,
include_headers: bool = False,
metadata_filename: str | None = None,
metadata_last_modified: str | None = None,
process_attachments: bool = False,
attachment_partitioner: Callable[..., list[Element]] | None = None,
languages: list[str] | None = ["auto"],
detect_language_per_element: bool = False,
**kwargs: Any,
) -> list[Element]:
"""Partitions an .eml documents into its constituent elements.
Parameters
----------
filename
A string defining the target filename path.
file
A file-like object using "r" mode --> open(filename, "r").
encoding
The encoding method used to decode the input bytes when drawn from `filename` or `file`.
Defaults to "utf-8".
text
The string representation of the .eml document.
content_source
default: "text/html"
other: "text/plain"
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
metadata_filename
The filename to use for the metadata.
metadata_last_modified
@ -97,13 +90,6 @@ def partition_email(
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
languages
User defined value for `metadata.languages` if provided. Otherwise language is detected
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
in either language.
Additional Parameters:
detect_language_per_element
Detect language per element instead of at the document level.
"""
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
@ -211,8 +197,9 @@ def partition_email(
elements = partition_html(
text=content,
metadata_filename=metadata_filename,
languages=[""],
metadata_file_type=FileType.EML,
detection_origin="email",
**kwargs,
)
for element in elements:
if isinstance(element, Text):
@ -244,8 +231,9 @@ def partition_email(
elements = partition_text(
text=content,
encoding=encoding,
languages=[""],
metadata_file_type=FileType.EML,
detection_origin="email",
**kwargs,
)
else:
raise ValueError(
@ -274,7 +262,7 @@ def partition_email(
last_modification_date=last_modified,
)
for element in all_elements:
element.metadata = copy.deepcopy(metadata)
element.metadata.update(metadata)
if process_attachments:
with TemporaryDirectory() as tmpdir:
@ -295,15 +283,7 @@ def partition_email(
element.metadata.attached_to_filename = metadata_filename or filename
all_elements.append(element)
elements = list(
apply_lang_metadata(
elements=all_elements,
languages=languages,
detect_language_per_element=detect_language_per_element,
),
)
return elements
return all_elements
# ================================================================================================

View File

@ -1,6 +1,5 @@
from __future__ import annotations
import copy
import os
import re
import tempfile
@ -9,21 +8,15 @@ from typing import IO, Any, Iterator, Optional
from oxmsg import Message
from oxmsg.attachment import Attachment
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
from unstructured.file_utils.filetype import add_metadata_with_filetype
from unstructured.documents.elements import Element, ElementMetadata
from unstructured.file_utils.model import FileType
from unstructured.logger import logger
from unstructured.partition.common.lang import apply_lang_metadata
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html
from unstructured.partition.text import partition_text
from unstructured.utils import is_temp_file_path, lazyproperty
@process_metadata()
@add_metadata_with_filetype(FileType.MSG)
@add_chunking_strategy
def partition_msg(
filename: Optional[str] = None,
*,
@ -55,15 +48,10 @@ def partition_msg(
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
partition_attachments=process_attachments,
kwargs=kwargs,
)
return list(
apply_lang_metadata(
elements=_MsgPartitioner.iter_message_elements(opts),
languages=kwargs.get("languages", ["auto"]),
detect_language_per_element=kwargs.get("detect_language_per_element", False),
)
)
return list(_MsgPartitioner.iter_message_elements(opts))
class MsgPartitionerOptions:
@ -77,12 +65,48 @@ class MsgPartitionerOptions:
metadata_file_path: str | None,
metadata_last_modified: str | None,
partition_attachments: bool,
kwargs: dict[str, Any],
):
self._file = file
self._file_path = file_path
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
self._partition_attachments = partition_attachments
self._kwargs = kwargs
@lazyproperty
def extra_msg_metadata(self) -> ElementMetadata:
"""ElementMetadata suitable for use on an element formed from message content.
These are only the metadata fields specific to email messages. The remaining metadata
fields produced by the delegate partitioner are used as produced.
None of these metadata fields change based on the element, so we just compute it once.
"""
msg = self.msg
sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
sent_to = [r.email_address for r in msg.recipients] or None
bcc_recipient = (
[c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None
)
cc_recipient = (
[c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None
)
if email_message_id := msg.message_headers.get("Message-Id"):
email_message_id = re.sub(r"^<|>$", "", email_message_id) # Strip angle brackets
element_metadata = ElementMetadata(
bcc_recipient=bcc_recipient,
cc_recipient=cc_recipient,
email_message_id=email_message_id,
sent_from=sent_from,
sent_to=sent_to,
subject=msg.subject or None,
)
element_metadata.detection_origin = "msg"
return element_metadata
@lazyproperty
def is_encrypted(self) -> bool:
@ -108,22 +132,14 @@ class MsgPartitionerOptions:
@lazyproperty
def metadata_last_modified(self) -> str | None:
"""Caller override for `.metadata.last_modified` to be applied to all elements."""
return self._metadata_last_modified
email_date = sent_date.isoformat() if (sent_date := self.msg.sent_date) else None
return self._metadata_last_modified or email_date or self._last_modified
@lazyproperty
def msg(self) -> Message:
"""The `oxmsg.Message` object loaded from file or filename."""
return Message.load(self._msg_file)
@property
def msg_metadata(self) -> ElementMetadata:
"""ElementMetadata suitable for use on an element formed from message content.
A distinct instance is returned on each reference such that downstream changes to the
metadata of one element is not also reflected in another element.
"""
return copy.copy(self._msg_metadata)
@lazyproperty
def partition_attachments(self) -> bool:
"""True when message attachments should also be partitioned."""
@ -131,22 +147,20 @@ class MsgPartitionerOptions:
@lazyproperty
def partitioning_kwargs(self) -> dict[str, Any]:
"""Partitioning keyword-arguments to be passed along to attachment partitioner."""
# TODO: no good reason we can't accept and pass along any file-type specific kwargs
# the caller might want to send along.
return {}
"""The "extra" keyword arguments received by `partition_msg()`.
These are passed along to delegate partitioners which extract keyword args like
`chunking_strategy` etc. in their decorators to control metadata behaviors, etc.
"""
return self._kwargs
@lazyproperty
def _last_modified(self) -> str | None:
"""The best last-modified date available from source-file, None if not available."""
if self._file_path:
return (
None
if is_temp_file_path(self._file_path)
else get_last_modified_date(self._file_path)
)
if not self._file_path or is_temp_file_path(self._file_path):
return None
return None
return get_last_modified_date(self._file_path)
@lazyproperty
def _msg_file(self) -> str | IO[bytes]:
@ -159,41 +173,6 @@ class MsgPartitionerOptions:
raise ValueError("one of `file` or `filename` arguments must be provided")
@property
def _msg_metadata(self) -> ElementMetadata:
"""ElementMetadata "template" for elements of this message.
None of these metadata fields change based on the element, so compute it once here and then
just make a separate copy for each element.
"""
msg = self.msg
email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None
sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
sent_to = [r.email_address for r in msg.recipients] or None
bcc_recipient = (
[c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None
)
cc_recipient = (
[c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None
)
if email_message_id := msg.message_headers.get("Message-Id"):
email_message_id = re.sub(r"^<|>$", "", email_message_id) # Strip angle brackets
element_metadata = ElementMetadata(
filename=self.metadata_file_path,
last_modified=self._metadata_last_modified or email_date or self._last_modified,
sent_from=sent_from,
sent_to=sent_to,
subject=msg.subject or None,
bcc_recipient=bcc_recipient,
cc_recipient=cc_recipient,
email_message_id=email_message_id,
)
element_metadata.detection_origin = "msg"
return element_metadata
class _MsgPartitioner:
"""Partitions Outlook email message (MSG) files."""
@ -230,15 +209,28 @@ class _MsgPartitioner:
msg = self._opts.msg
if html_body := msg.html_body:
elements = partition_html(text=html_body, languages=[""])
elements = partition_html(
text=html_body,
metadata_filename=self._opts.metadata_file_path,
metadata_file_type=FileType.MSG,
metadata_last_modified=self._opts.metadata_last_modified,
**self._opts.partitioning_kwargs,
)
elif msg.body:
elements = partition_text(text=msg.body, languages=[""])
elements = partition_text(
text=msg.body,
metadata_filename=self._opts.metadata_file_path,
metadata_file_type=FileType.MSG,
metadata_last_modified=self._opts.metadata_last_modified,
**self._opts.partitioning_kwargs,
)
else:
elements: list[Element] = []
# -- replace the element metadata with email-specific values --
# -- augment the element metadata with email-specific values --
email_specific_metadata = self._opts.extra_msg_metadata
for e in elements:
e.metadata = self._opts.msg_metadata
e.metadata.update(email_specific_metadata)
yield e