rfctr(part): remove double-decoration 5 (#3692)

**Summary** Remove double-decoration from the EML and MSG partitioners. **Additional Context** - These had to wait until the end because `partition_email()` and `partition_msg()` can delegate to any other partitioner to process an attachment. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
2025-12-14 08:44:29 +00:00 · 2024-10-04 14:01:32 -07:00 · 2024-10-04 14:01:32 -07:00 · 718891a447
commit 718891a447
parent 4711a8dc26
12 changed files with 219 additions and 223 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.15.14-dev10
+## 0.15.14-dev11

 ### Enhancements

@ -16,6 +16,7 @@
 * **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
 * **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
 * **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`.
+* **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG.

 ## 0.15.13

--- a/test_unstructured/metrics/test_element_type.py
+++ b/test_unstructured/metrics/test_element_type.py
@ -19,8 +19,8 @@ from unstructured.staging.base import elements_to_json
            "fake-email.txt",
            {
                ("NarrativeText", None): 1,
-                ("Title", None): 1,
-                ("ListItem", None): 2,
+                ("Title", 0): 1,
+                ("ListItem", 1): 2,
            },
        ),
        (
@ -49,8 +49,8 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
        (
            "fake-email.txt",
            {
-                ("Title", None): 1,
-                ("ListItem", None): 2,
+                ("Title", 0): 1,
+                ("ListItem", 1): 2,
                ("NarrativeText", None): 2,
            },
            (0.8, 0.8, 0.80),
--- a/test_unstructured/partition/test_email.py
+++ b/test_unstructured/partition/test_email.py
@ -471,6 +471,18 @@ def test_partition_email_from_filename_has_metadata():
        assert element.metadata.filename == "fake-email.eml"


+# -- .metadata.filetype --------------------------------------------------------------------------
+
+
+def test_partition_email_gets_the_EMAIL_mime_type_in_metadata_filetype():
+    EMAIL_MIME_TYPE = "message/rfc822"
+    elements = partition_email(example_doc_path("fake-email.eml"))
+    assert all(e.metadata.filetype == EMAIL_MIME_TYPE for e in elements), (
+        f"Expected all elements to have '{EMAIL_MIME_TYPE}' as their filetype, but got:"
+        f" {repr(elements[0].metadata.filetype)}"
+    )
+
+
 # -- .metadata.last_modified ---------------------------------------------------------------------


--- a/test_unstructured/partition/test_msg.py
+++ b/test_unstructured/partition/test_msg.py
@ -55,7 +55,6 @@ def test_partition_msg_from_filename():
            languages=["eng"],
        ).to_dict()
    )
-    assert all(e.metadata.filename == "fake-email.msg" for e in elements)


 def test_partition_msg_from_filename_returns_uns_elements():
@ -156,6 +155,51 @@ def test_partition_msg_can_process_attachments():
    ]


+# -- .metadata.filename --------------------------------------------------------------------------
+
+
+def test_partition_msg_from_filename_gets_filename_metadata_from_file_path():
+    elements = partition_msg(example_doc_path("fake-email.msg"))
+
+    assert all(e.metadata.filename == "fake-email.msg" for e in elements)
+    assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
+
+
+def test_partition_msg_from_file_gets_filename_metadata_None():
+    with open(example_doc_path("fake-email.msg"), "rb") as f:
+        elements = partition_msg(file=f)
+
+    assert all(e.metadata.filename is None for e in elements)
+    assert all(e.metadata.file_directory is None for e in elements)
+
+
+def test_partition_msg_from_filename_prefers_metadata_filename():
+    elements = partition_msg(example_doc_path("fake-email.msg"), metadata_filename="a/b/c.msg")
+
+    assert all(e.metadata.filename == "c.msg" for e in elements)
+    assert all(e.metadata.file_directory == "a/b" for e in elements)
+
+
+def test_partition_msg_from_file_prefers_metadata_filename():
+    with open(example_doc_path("fake-email.msg"), "rb") as f:
+        elements = partition_msg(file=f, metadata_filename="d/e/f.msg")
+
+    assert all(e.metadata.filename == "f.msg" for e in elements)
+    assert all(e.metadata.file_directory == "d/e" for e in elements)
+
+
+# -- .metadata.filetype --------------------------------------------------------------------------
+
+
+def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype():
+    MSG_MIME_TYPE = "application/vnd.ms-outlook"
+    elements = partition_msg(example_doc_path("fake-email.msg"))
+    assert all(e.metadata.filetype == MSG_MIME_TYPE for e in elements), (
+        f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:"
+        f" {repr(elements[0].metadata.filetype)}"
+    )
+
+
 # -- .metadata.last_modified ---------------------------------------------------------------------


@ -242,6 +286,24 @@ def test_partition_msg_raises_TypeError_for_invalid_languages():
 class DescribeMsgPartitionerOptions:
    """Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects."""

+    # -- .extra_msg_metadata ---------------------
+
+    def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]):
+        opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
+        opts = MsgPartitionerOptions(**opts_args)
+
+        m = opts.extra_msg_metadata
+        assert m.bcc_recipient == ["hello@unstructured.io"]
+        assert m.cc_recipient == ["steve@unstructured.io"]
+        assert m.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
+        assert m.sent_from == ['"John" <johnjennings702@gmail.com>']
+        assert m.sent_to == [
+            "john-ctr@unstructured.io",
+            "steve@unstructured.io",
+            "hello@unstructured.io",
+        ]
+        assert m.subject == "Fake email with cc and bcc recipients"
+
    # -- .is_encrypted ---------------------------

    @pytest.mark.parametrize(
@ -257,34 +319,58 @@ class DescribeMsgPartitionerOptions:

    # -- .metadata_file_path ---------------------

-    def it_uses_the_user_provided_metadata_file_path_when_provided(self, opts_args: dict[str, Any]):
+    def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
        opts_args["file_path"] = "x/y/z.msg"
        opts_args["metadata_file_path"] = "a/b/c.msg"
        opts = MsgPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == "a/b/c.msg"

-    @pytest.mark.parametrize("file_path", ["u/v/w.msg", None])
-    def and_it_falls_back_to_the_document_file_path_otherwise_including_when_the_file_path_is_None(
-        self, file_path: str | None, opts_args: dict[str, Any]
-    ):
+    def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
+        file_path = example_doc_path("fake-email.msg")
        opts_args["file_path"] = file_path
-        opts_args["metadata_file_path"] = None
        opts = MsgPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == file_path

+    def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]):
+        opts = MsgPartitionerOptions(**opts_args)
+
+        assert opts.metadata_file_path is None
+
    # -- .metadata_last_modified -----------------

-    @pytest.mark.parametrize("metadata_last_modified", ["2024-03-05T17:02:53", None])
-    def it_knows_the_metadata_last_modified_date_provided_by_the_caller(
-        self, metadata_last_modified: str | None, opts_args: dict[str, Any]
-    ):
+    def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]):
+        metadata_last_modified = "2024-03-05T17:02:53"
        opts_args["metadata_last_modified"] = metadata_last_modified
+        opts_args["file_path"] = example_doc_path("fake-email.msg")
        opts = MsgPartitionerOptions(**opts_args)

        assert opts.metadata_last_modified == metadata_last_modified

+    def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["file_path"] = example_doc_path("fake-email.msg")
+        opts = MsgPartitionerOptions(**opts_args)
+
+        assert opts.metadata_last_modified == "2023-03-28T17:00:31+00:00"
+
+    @pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None])
+    def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
+        self,
+        opts_args: dict[str, Any],
+        filesystem_last_modified: str | None,
+        Message_sent_date_: Mock,
+        _last_modified_prop_: Mock,
+    ):
+        Message_sent_date_.return_value = None
+        _last_modified_prop_.return_value = filesystem_last_modified
+        opts_args["file_path"] = example_doc_path("fake-email.msg")
+        opts = MsgPartitionerOptions(**opts_args)
+
+        assert opts.metadata_last_modified == filesystem_last_modified
+
    # -- .msg ------------------------------------

    def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]):
@ -306,88 +392,6 @@ class DescribeMsgPartitionerOptions:
        with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"):
            MsgPartitionerOptions(**opts_args).msg

-    # -- .msg_metadata ---------------------------
-
-    def it_provides_a_unique_metadata_instance_for_each_element(self, opts_args: dict[str, Any]):
-        opts_args["file_path"] = example_doc_path("fake-email.msg")
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert opts.msg_metadata is not opts.msg_metadata
-
-    # -- .metadata.filename ----------------------
-
-    def it_uses_the_metadata_file_path_value_for_msg_metadata(
-        self, opts_args: dict[str, Any], metadata_file_path_prop_: Mock
-    ):
-        metadata_file_path_prop_.return_value = "a/b/c.msg"
-        opts_args["file_path"] = example_doc_path("fake-email.msg")
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert opts.msg_metadata.filename == "c.msg"
-        assert opts.msg_metadata.file_directory == "a/b"
-
-    # -- .metadata.last_modified -----------------
-
-    def it_uses_metadata_last_modified_when_provided_by_caller(self, opts_args: dict[str, Any]):
-        opts_args["file_path"] = example_doc_path("fake-email.msg")
-        opts_args["metadata_last_modified"] = "2024-06-03T20:07:31+00:00"
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert opts.msg_metadata.last_modified == "2024-06-03T20:07:31+00:00"
-
-    def and_it_uses_the_sent_date_of_the_email_when_metadata_last_modified_is_not_provided(
-        self, opts_args: dict[str, Any]
-    ):
-        opts_args["file_path"] = example_doc_path("fake-email.msg")
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert opts.msg_metadata.last_modified == "2023-03-28T17:00:31+00:00"
-
-    @pytest.mark.parametrize("file_last_modified", ["2024-06-03T20:12:53", None])
-    def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
-        self,
-        opts_args: dict[str, Any],
-        file_last_modified: str | None,
-        Message_sent_date_: Mock,
-        _last_modified_prop_: Mock,
-    ):
-        Message_sent_date_.return_value = None
-        _last_modified_prop_.return_value = file_last_modified
-        opts_args["file_path"] = example_doc_path("fake-email.msg")
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert opts.msg_metadata.last_modified == file_last_modified
-
-    # -- .metadata (email-specific) --------------
-
-    def it_adds_email_specific_fields_to_the_msg_element_metadata(self, opts_args: dict[str, Any]):
-        opts_args["file_path"] = example_doc_path("fake-email.msg")
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert opts.msg_metadata.sent_from == ['"Matthew Robinson" <mrobinson@unstructured.io>']
-        assert opts.msg_metadata.sent_to == ["mrobinson@unstructured.io"]
-        assert opts.msg_metadata.subject == "Test Email"
-
-    def it_captures_cc_and_bcc_element_metadata(self, opts_args: dict[str, Any]):
-        opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert opts.msg_metadata.cc_recipient == ["steve@unstructured.io"]
-        assert opts.msg_metadata.bcc_recipient == ["hello@unstructured.io"]
-        assert opts.msg_metadata.sent_to == [
-            "john-ctr@unstructured.io",
-            "steve@unstructured.io",
-            "hello@unstructured.io",
-        ]
-
-    def it_captures_email_message_id_element_metadata(self, opts_args: dict[str, Any]):
-        opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
-        opts = MsgPartitionerOptions(**opts_args)
-
-        assert (
-            opts.msg_metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
-        )
-
    # -- .partition_attachments ------------------

    @pytest.mark.parametrize("partition_attachments", [True, False])
@ -400,6 +404,16 @@ class DescribeMsgPartitionerOptions:

        assert opts.partition_attachments is partition_attachments

+    # -- .partitioning_kwargs --------------------
+
+    def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function(
+        self, opts_args: dict[str, Any]
+    ):
+        opts_args["kwargs"] = {"foo": 42, "bar": "baz"}
+        opts = MsgPartitionerOptions(**opts_args)
+
+        assert opts.partitioning_kwargs == {"foo": 42, "bar": "baz"}
+
    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture
@ -410,10 +424,6 @@ class DescribeMsgPartitionerOptions:
    def Message_sent_date_(self, request: FixtureRequest):
        return property_mock(request, Message, "sent_date")

-    @pytest.fixture
-    def metadata_file_path_prop_(self, request: FixtureRequest):
-        return property_mock(request, MsgPartitionerOptions, "metadata_file_path")
-
    @pytest.fixture
    def opts_args(self) -> dict[str, Any]:
        """All default arguments for `MsgPartitionerOptions`.
@ -427,4 +437,5 @@ class DescribeMsgPartitionerOptions:
            "metadata_file_path": None,
            "metadata_last_modified": None,
            "partition_attachments": False,
+            "kwargs": {},
        }
--- a/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json
+++ b/test_unstructured_ingest/expected-structured-output/outlook/21be155fb0c95885.eml.json
@ -1,6 +1,6 @@
 [
  {
-    "element_id": "df08d0aeb11a34e75766d2d2008d73a6",
+    "element_id": "e482ff3e97d6318a4c0e00aea0adf544",
    "metadata": {
      "data_source": {
        "date_created": "2023-07-15T15:36:08",
--- a/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json
+++ b/test_unstructured_ingest/expected-structured-output/outlook/497eba8c81c801c6.eml.json
@ -1,6 +1,6 @@
 [
  {
-    "element_id": "e40af23706b4096145f1e4b007719aa5",
+    "element_id": "4a69e8fcddd4b6eff8488a34ba16b0dd",
    "metadata": {
      "data_source": {
        "date_created": "2023-07-25T01:26:22",
--- a/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json
+++ b/test_unstructured_ingest/expected-structured-output/outlook/4a16a411f162ebbb.eml.json
@ -1,6 +1,6 @@
 [
  {
-    "element_id": "8488a63070421b09a14ad6078c2cec2a",
+    "element_id": "4df3eedf1b6f98566fc40a132b48205f",
    "metadata": {
      "data_source": {
        "date_created": "2023-07-10T03:39:04",
--- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json
+++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErPIAU.eml.json
@ -1,9 +1,13 @@
 [
  {
    "type": "NarrativeText",
-    "element_id": "4196fe41da19e8657761ecffcafd3d2f",
+    "element_id": "191e99ff4061730e85d9300183b4ccbe",
    "text": "Jane. This is a test of sending you an email from Salesforce! _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/",
    "metadata": {
+      "languages": [
+        "eng"
+      ],
+      "filetype": "message/rfc822",
      "email_message_id": "KhIK4000000000000000000000000000000000000000000000RZP1T400CmuP1P5wTm2m679gi-mnIg@sfdc.net",
      "sent_from": [
        "devops+salesforce-connector@unstructured.io"
@ -12,10 +16,6 @@
        "jane_gray@uoa.edu"
      ],
      "subject": "Test of email 1",
-      "languages": [
-        "eng"
-      ],
-      "filetype": "message/rfc822",
      "data_source": {
        "url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
        "version": "1694691603.0",
--- a/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json
+++ b/test_unstructured_ingest/expected-structured-output/salesforce/EmailMessage/02sHu00001efErQIAU.eml.json
@ -1,9 +1,13 @@
 [
  {
    "type": "NarrativeText",
-    "element_id": "6f168cd430b41fc0d66a3691ef3caa0f",
+    "element_id": "f7d72e773a4c72747c88d8ea6e5d012a",
    "text": "Hey Sean. Testing email parsing here. Type: email Just testing the email system _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/",
    "metadata": {
+      "languages": [
+        "eng"
+      ],
+      "filetype": "message/rfc822",
      "email_message_id": "CuWky000000000000000000000000000000000000000000000RZP1VO00MaLK8OmEQm2Bw-c3ek6uNg@sfdc.net",
      "sent_from": [
        "devops+salesforce-connector@unstructured.io"
@ -12,10 +16,6 @@
        "sean@edge.com"
      ],
      "subject": "Test of Salesforce 2",
-      "languages": [
-        "eng"
-      ],
-      "filetype": "message/rfc822",
      "data_source": {
        "url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
        "version": "1694691603.0",
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.14-dev10"  # pragma: no cover
+__version__ = "0.15.14-dev11"  # pragma: no cover
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@ -1,6 +1,5 @@
 from __future__ import annotations

-import copy
 import datetime
 import email
 import os
@ -12,7 +11,6 @@ from functools import partial
 from tempfile import TemporaryDirectory
 from typing import IO, Any, Callable, Final, Type, cast

-from unstructured.chunking import add_chunking_strategy
 from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings
 from unstructured.cleaners.extract import (
    extract_datetimetz,
@ -27,7 +25,6 @@ from unstructured.documents.elements import (
    NarrativeText,
    Text,
    Title,
-    process_metadata,
 )
 from unstructured.documents.email_elements import (
    MetaData,
@ -42,12 +39,10 @@ from unstructured.file_utils.encoding import (
    read_txt_file,
    validate_encoding,
 )
-from unstructured.file_utils.filetype import add_metadata_with_filetype
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
 from unstructured.partition.common.common import convert_to_bytes, exactly_one
-from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.html import partition_html
 from unstructured.partition.text import partition_text
@ -56,38 +51,36 @@ VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"]
 DETECTION_ORIGIN: str = "email"


-@process_metadata()
-@add_metadata_with_filetype(FileType.EML)
-@add_chunking_strategy
 def partition_email(
    filename: str | None = None,
+    *,
    file: IO[bytes] | None = None,
+    encoding: str | None = None,
    text: str | None = None,
    content_source: str = "text/html",
-    encoding: str | None = None,
    include_headers: bool = False,
    metadata_filename: str | None = None,
    metadata_last_modified: str | None = None,
    process_attachments: bool = False,
    attachment_partitioner: Callable[..., list[Element]] | None = None,
-    languages: list[str] | None = ["auto"],
-    detect_language_per_element: bool = False,
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions an .eml documents into its constituent elements.
+
    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
+    encoding
+        The encoding method used to decode the input bytes when drawn from `filename` or `file`.
+        Defaults to "utf-8".
    text
        The string representation of the .eml document.
    content_source
        default: "text/html"
        other: "text/plain"
-    encoding
-        The encoding method used to decode the text input. If None, utf-8 will be used.
    metadata_filename
        The filename to use for the metadata.
    metadata_last_modified
@ -97,13 +90,6 @@ def partition_email(
        processing the content of the email itself.
    attachment_partitioner
        The partitioning function to use to process attachments.
-    languages
-        User defined value for `metadata.languages` if provided. Otherwise language is detected
-        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
-        in either language.
-        Additional Parameters:
-            detect_language_per_element
-                Detect language per element instead of at the document level.
    """
    if content_source not in VALID_CONTENT_SOURCES:
        raise ValueError(
@ -211,8 +197,9 @@ def partition_email(
        elements = partition_html(
            text=content,
            metadata_filename=metadata_filename,
-            languages=[""],
+            metadata_file_type=FileType.EML,
            detection_origin="email",
+            **kwargs,
        )
        for element in elements:
            if isinstance(element, Text):
@ -244,8 +231,9 @@ def partition_email(
        elements = partition_text(
            text=content,
            encoding=encoding,
-            languages=[""],
+            metadata_file_type=FileType.EML,
            detection_origin="email",
+            **kwargs,
        )
    else:
        raise ValueError(
@ -274,7 +262,7 @@ def partition_email(
        last_modification_date=last_modified,
    )
    for element in all_elements:
-        element.metadata = copy.deepcopy(metadata)
+        element.metadata.update(metadata)

    if process_attachments:
        with TemporaryDirectory() as tmpdir:
@ -295,15 +283,7 @@ def partition_email(
                    element.metadata.attached_to_filename = metadata_filename or filename
                    all_elements.append(element)

-    elements = list(
-        apply_lang_metadata(
-            elements=all_elements,
-            languages=languages,
-            detect_language_per_element=detect_language_per_element,
-        ),
-    )
-
-    return elements
+    return all_elements


 # ================================================================================================
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@ -1,6 +1,5 @@
 from __future__ import annotations

-import copy
 import os
 import re
 import tempfile
@ -9,21 +8,15 @@ from typing import IO, Any, Iterator, Optional
 from oxmsg import Message
 from oxmsg.attachment import Attachment

-from unstructured.chunking import add_chunking_strategy
-from unstructured.documents.elements import Element, ElementMetadata, process_metadata
-from unstructured.file_utils.filetype import add_metadata_with_filetype
+from unstructured.documents.elements import Element, ElementMetadata
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
-from unstructured.partition.common.lang import apply_lang_metadata
 from unstructured.partition.common.metadata import get_last_modified_date
 from unstructured.partition.html import partition_html
 from unstructured.partition.text import partition_text
 from unstructured.utils import is_temp_file_path, lazyproperty


-@process_metadata()
-@add_metadata_with_filetype(FileType.MSG)
-@add_chunking_strategy
 def partition_msg(
    filename: Optional[str] = None,
    *,
@ -55,15 +48,10 @@ def partition_msg(
        metadata_file_path=metadata_filename,
        metadata_last_modified=metadata_last_modified,
        partition_attachments=process_attachments,
+        kwargs=kwargs,
    )

-    return list(
-        apply_lang_metadata(
-            elements=_MsgPartitioner.iter_message_elements(opts),
-            languages=kwargs.get("languages", ["auto"]),
-            detect_language_per_element=kwargs.get("detect_language_per_element", False),
-        )
-    )
+    return list(_MsgPartitioner.iter_message_elements(opts))


 class MsgPartitionerOptions:
@ -77,12 +65,48 @@ class MsgPartitionerOptions:
        metadata_file_path: str | None,
        metadata_last_modified: str | None,
        partition_attachments: bool,
+        kwargs: dict[str, Any],
    ):
        self._file = file
        self._file_path = file_path
        self._metadata_file_path = metadata_file_path
        self._metadata_last_modified = metadata_last_modified
        self._partition_attachments = partition_attachments
+        self._kwargs = kwargs
+
+    @lazyproperty
+    def extra_msg_metadata(self) -> ElementMetadata:
+        """ElementMetadata suitable for use on an element formed from message content.
+
+        These are only the metadata fields specific to email messages. The remaining metadata
+        fields produced by the delegate partitioner are used as produced.
+
+        None of these metadata fields change based on the element, so we just compute it once.
+        """
+        msg = self.msg
+
+        sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
+        sent_to = [r.email_address for r in msg.recipients] or None
+        bcc_recipient = (
+            [c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None
+        )
+        cc_recipient = (
+            [c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None
+        )
+        if email_message_id := msg.message_headers.get("Message-Id"):
+            email_message_id = re.sub(r"^<|>$", "", email_message_id)  # Strip angle brackets
+
+        element_metadata = ElementMetadata(
+            bcc_recipient=bcc_recipient,
+            cc_recipient=cc_recipient,
+            email_message_id=email_message_id,
+            sent_from=sent_from,
+            sent_to=sent_to,
+            subject=msg.subject or None,
+        )
+        element_metadata.detection_origin = "msg"
+
+        return element_metadata

    @lazyproperty
    def is_encrypted(self) -> bool:
@ -108,22 +132,14 @@ class MsgPartitionerOptions:
    @lazyproperty
    def metadata_last_modified(self) -> str | None:
        """Caller override for `.metadata.last_modified` to be applied to all elements."""
-        return self._metadata_last_modified
+        email_date = sent_date.isoformat() if (sent_date := self.msg.sent_date) else None
+        return self._metadata_last_modified or email_date or self._last_modified

    @lazyproperty
    def msg(self) -> Message:
        """The `oxmsg.Message` object loaded from file or filename."""
        return Message.load(self._msg_file)

-    @property
-    def msg_metadata(self) -> ElementMetadata:
-        """ElementMetadata suitable for use on an element formed from message content.
-
-        A distinct instance is returned on each reference such that downstream changes to the
-        metadata of one element is not also reflected in another element.
-        """
-        return copy.copy(self._msg_metadata)
-
    @lazyproperty
    def partition_attachments(self) -> bool:
        """True when message attachments should also be partitioned."""
@ -131,22 +147,20 @@ class MsgPartitionerOptions:

    @lazyproperty
    def partitioning_kwargs(self) -> dict[str, Any]:
-        """Partitioning keyword-arguments to be passed along to attachment partitioner."""
-        # TODO: no good reason we can't accept and pass along any file-type specific kwargs
-        # the caller might want to send along.
-        return {}
+        """The "extra" keyword arguments received by `partition_msg()`.
+
+        These are passed along to delegate partitioners which extract keyword args like
+        `chunking_strategy` etc. in their decorators to control metadata behaviors, etc.
+        """
+        return self._kwargs

    @lazyproperty
    def _last_modified(self) -> str | None:
        """The best last-modified date available from source-file, None if not available."""
-        if self._file_path:
-            return (
-                None
-                if is_temp_file_path(self._file_path)
-                else get_last_modified_date(self._file_path)
-            )
+        if not self._file_path or is_temp_file_path(self._file_path):
+            return None

-        return None
+        return get_last_modified_date(self._file_path)

    @lazyproperty
    def _msg_file(self) -> str | IO[bytes]:
@ -159,41 +173,6 @@ class MsgPartitionerOptions:

        raise ValueError("one of `file` or `filename` arguments must be provided")

-    @property
-    def _msg_metadata(self) -> ElementMetadata:
-        """ElementMetadata "template" for elements of this message.
-
-        None of these metadata fields change based on the element, so compute it once here and then
-        just make a separate copy for each element.
-        """
-        msg = self.msg
-
-        email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None
-        sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
-        sent_to = [r.email_address for r in msg.recipients] or None
-        bcc_recipient = (
-            [c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None
-        )
-        cc_recipient = (
-            [c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None
-        )
-        if email_message_id := msg.message_headers.get("Message-Id"):
-            email_message_id = re.sub(r"^<|>$", "", email_message_id)  # Strip angle brackets
-
-        element_metadata = ElementMetadata(
-            filename=self.metadata_file_path,
-            last_modified=self._metadata_last_modified or email_date or self._last_modified,
-            sent_from=sent_from,
-            sent_to=sent_to,
-            subject=msg.subject or None,
-            bcc_recipient=bcc_recipient,
-            cc_recipient=cc_recipient,
-            email_message_id=email_message_id,
-        )
-        element_metadata.detection_origin = "msg"
-
-        return element_metadata
-

 class _MsgPartitioner:
    """Partitions Outlook email message (MSG) files."""
@ -230,15 +209,28 @@ class _MsgPartitioner:
        msg = self._opts.msg

        if html_body := msg.html_body:
-            elements = partition_html(text=html_body, languages=[""])
+            elements = partition_html(
+                text=html_body,
+                metadata_filename=self._opts.metadata_file_path,
+                metadata_file_type=FileType.MSG,
+                metadata_last_modified=self._opts.metadata_last_modified,
+                **self._opts.partitioning_kwargs,
+            )
        elif msg.body:
-            elements = partition_text(text=msg.body, languages=[""])
+            elements = partition_text(
+                text=msg.body,
+                metadata_filename=self._opts.metadata_file_path,
+                metadata_file_type=FileType.MSG,
+                metadata_last_modified=self._opts.metadata_last_modified,
+                **self._opts.partitioning_kwargs,
+            )
        else:
            elements: list[Element] = []

-        # -- replace the element metadata with email-specific values --
+        # -- augment the element metadata with email-specific values --
+        email_specific_metadata = self._opts.extra_msg_metadata
        for e in elements:
-            e.metadata = self._opts.msg_metadata
+            e.metadata.update(email_specific_metadata)
            yield e