mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 12:19:36 +00:00
rfctr(part): remove double-decoration 5 (#3692)
**Summary** Remove double-decoration from EML and MSG. **Additional Context** - These needed to wait until the end because `partition_email()` and `partition_msg()` can use any other partitioner for one of their attachments. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com>
This commit is contained in:
parent
4711a8dc26
commit
718891a447
@ -1,4 +1,4 @@
|
||||
## 0.15.14-dev10
|
||||
## 0.15.14-dev11
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
* **Remove double-decoration for PPT, PPTX, TSV, XLSX, and XML partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner; remove decoration from delegating partitioners.
|
||||
* **Remove double-decoration for HTML, EPUB, MD, ORG, RST, and RTF partitioners.** Refactor these partitioners to use the new `@apply_metadata()` decorator and only decorate the principal partitioner (HTML in this case); remove decoration from delegating partitioners.
|
||||
* **Remove obsolete min_partition/max_partition args from TXT and EML.** The legacy `min_partition` and `max_partition` parameters were an initial rough implementation of chunking but now interfere with chunking and are unused. Remove those parameters from `partition_text()` and `partition_email()`.
|
||||
* **Remove double-decoration on EML and MSG.** Refactor these partitioners to rely on the new `@apply_metadata()` decorator operating on partitioners they delegate to (TXT, HTML, and all others for attachments) and remove direct decoration from EML and MSG.
|
||||
|
||||
## 0.15.13
|
||||
|
||||
|
@ -19,8 +19,8 @@ from unstructured.staging.base import elements_to_json
|
||||
"fake-email.txt",
|
||||
{
|
||||
("NarrativeText", None): 1,
|
||||
("Title", None): 1,
|
||||
("ListItem", None): 2,
|
||||
("Title", 0): 1,
|
||||
("ListItem", 1): 2,
|
||||
},
|
||||
),
|
||||
(
|
||||
@ -49,8 +49,8 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
|
||||
(
|
||||
"fake-email.txt",
|
||||
{
|
||||
("Title", None): 1,
|
||||
("ListItem", None): 2,
|
||||
("Title", 0): 1,
|
||||
("ListItem", 1): 2,
|
||||
("NarrativeText", None): 2,
|
||||
},
|
||||
(0.8, 0.8, 0.80),
|
||||
|
@ -471,6 +471,18 @@ def test_partition_email_from_filename_has_metadata():
|
||||
assert element.metadata.filename == "fake-email.eml"
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_email_gets_the_EMAIL_mime_type_in_metadata_filetype():
|
||||
EMAIL_MIME_TYPE = "message/rfc822"
|
||||
elements = partition_email(example_doc_path("fake-email.eml"))
|
||||
assert all(e.metadata.filetype == EMAIL_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{EMAIL_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -55,7 +55,6 @@ def test_partition_msg_from_filename():
|
||||
languages=["eng"],
|
||||
).to_dict()
|
||||
)
|
||||
assert all(e.metadata.filename == "fake-email.msg" for e in elements)
|
||||
|
||||
|
||||
def test_partition_msg_from_filename_returns_uns_elements():
|
||||
@ -156,6 +155,51 @@ def test_partition_msg_can_process_attachments():
|
||||
]
|
||||
|
||||
|
||||
# -- .metadata.filename --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_msg_from_filename_gets_filename_metadata_from_file_path():
|
||||
elements = partition_msg(example_doc_path("fake-email.msg"))
|
||||
|
||||
assert all(e.metadata.filename == "fake-email.msg" for e in elements)
|
||||
assert all(e.metadata.file_directory == example_doc_path("") for e in elements)
|
||||
|
||||
|
||||
def test_partition_msg_from_file_gets_filename_metadata_None():
|
||||
with open(example_doc_path("fake-email.msg"), "rb") as f:
|
||||
elements = partition_msg(file=f)
|
||||
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
assert all(e.metadata.file_directory is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_msg_from_filename_prefers_metadata_filename():
|
||||
elements = partition_msg(example_doc_path("fake-email.msg"), metadata_filename="a/b/c.msg")
|
||||
|
||||
assert all(e.metadata.filename == "c.msg" for e in elements)
|
||||
assert all(e.metadata.file_directory == "a/b" for e in elements)
|
||||
|
||||
|
||||
def test_partition_msg_from_file_prefers_metadata_filename():
|
||||
with open(example_doc_path("fake-email.msg"), "rb") as f:
|
||||
elements = partition_msg(file=f, metadata_filename="d/e/f.msg")
|
||||
|
||||
assert all(e.metadata.filename == "f.msg" for e in elements)
|
||||
assert all(e.metadata.file_directory == "d/e" for e in elements)
|
||||
|
||||
|
||||
# -- .metadata.filetype --------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_msg_gets_the_MSG_mime_type_in_metadata_filetype():
|
||||
MSG_MIME_TYPE = "application/vnd.ms-outlook"
|
||||
elements = partition_msg(example_doc_path("fake-email.msg"))
|
||||
assert all(e.metadata.filetype == MSG_MIME_TYPE for e in elements), (
|
||||
f"Expected all elements to have '{MSG_MIME_TYPE}' as their filetype, but got:"
|
||||
f" {repr(elements[0].metadata.filetype)}"
|
||||
)
|
||||
|
||||
|
||||
# -- .metadata.last_modified ---------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -242,6 +286,24 @@ def test_partition_msg_raises_TypeError_for_invalid_languages():
|
||||
class DescribeMsgPartitionerOptions:
|
||||
"""Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects."""
|
||||
|
||||
# -- .extra_msg_metadata ---------------------
|
||||
|
||||
def it_provides_email_specific_metadata_to_add_to_each_element(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
m = opts.extra_msg_metadata
|
||||
assert m.bcc_recipient == ["hello@unstructured.io"]
|
||||
assert m.cc_recipient == ["steve@unstructured.io"]
|
||||
assert m.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
|
||||
assert m.sent_from == ['"John" <johnjennings702@gmail.com>']
|
||||
assert m.sent_to == [
|
||||
"john-ctr@unstructured.io",
|
||||
"steve@unstructured.io",
|
||||
"hello@unstructured.io",
|
||||
]
|
||||
assert m.subject == "Fake email with cc and bcc recipients"
|
||||
|
||||
# -- .is_encrypted ---------------------------
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -257,34 +319,58 @@ class DescribeMsgPartitionerOptions:
|
||||
|
||||
# -- .metadata_file_path ---------------------
|
||||
|
||||
def it_uses_the_user_provided_metadata_file_path_when_provided(self, opts_args: dict[str, Any]):
|
||||
def it_uses_the_metadata_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = "x/y/z.msg"
|
||||
opts_args["metadata_file_path"] = "a/b/c.msg"
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == "a/b/c.msg"
|
||||
|
||||
@pytest.mark.parametrize("file_path", ["u/v/w.msg", None])
|
||||
def and_it_falls_back_to_the_document_file_path_otherwise_including_when_the_file_path_is_None(
|
||||
self, file_path: str | None, opts_args: dict[str, Any]
|
||||
):
|
||||
def and_it_falls_back_to_the_MSG_file_path_arg_when_provided(self, opts_args: dict[str, Any]):
|
||||
file_path = example_doc_path("fake-email.msg")
|
||||
opts_args["file_path"] = file_path
|
||||
opts_args["metadata_file_path"] = None
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path == file_path
|
||||
|
||||
def but_it_returns_None_when_neither_path_is_available(self, opts_args: dict[str, Any]):
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_file_path is None
|
||||
|
||||
# -- .metadata_last_modified -----------------
|
||||
|
||||
@pytest.mark.parametrize("metadata_last_modified", ["2024-03-05T17:02:53", None])
|
||||
def it_knows_the_metadata_last_modified_date_provided_by_the_caller(
|
||||
self, metadata_last_modified: str | None, opts_args: dict[str, Any]
|
||||
):
|
||||
def it_uses_metadata_last_modified_when_provided_by_the_caller(self, opts_args: dict[str, Any]):
|
||||
metadata_last_modified = "2024-03-05T17:02:53"
|
||||
opts_args["metadata_last_modified"] = metadata_last_modified
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_last_modified == metadata_last_modified
|
||||
|
||||
def and_it_uses_the_message_Date_header_when_metadata_last_modified_is_not_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_last_modified == "2023-03-28T17:00:31+00:00"
|
||||
|
||||
@pytest.mark.parametrize("filesystem_last_modified", ["2024-06-03T20:12:53", None])
|
||||
def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
|
||||
self,
|
||||
opts_args: dict[str, Any],
|
||||
filesystem_last_modified: str | None,
|
||||
Message_sent_date_: Mock,
|
||||
_last_modified_prop_: Mock,
|
||||
):
|
||||
Message_sent_date_.return_value = None
|
||||
_last_modified_prop_.return_value = filesystem_last_modified
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.metadata_last_modified == filesystem_last_modified
|
||||
|
||||
# -- .msg ------------------------------------
|
||||
|
||||
def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]):
|
||||
@ -306,88 +392,6 @@ class DescribeMsgPartitionerOptions:
|
||||
with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"):
|
||||
MsgPartitionerOptions(**opts_args).msg
|
||||
|
||||
# -- .msg_metadata ---------------------------
|
||||
|
||||
def it_provides_a_unique_metadata_instance_for_each_element(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.msg_metadata is not opts.msg_metadata
|
||||
|
||||
# -- .metadata.filename ----------------------
|
||||
|
||||
def it_uses_the_metadata_file_path_value_for_msg_metadata(
|
||||
self, opts_args: dict[str, Any], metadata_file_path_prop_: Mock
|
||||
):
|
||||
metadata_file_path_prop_.return_value = "a/b/c.msg"
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.msg_metadata.filename == "c.msg"
|
||||
assert opts.msg_metadata.file_directory == "a/b"
|
||||
|
||||
# -- .metadata.last_modified -----------------
|
||||
|
||||
def it_uses_metadata_last_modified_when_provided_by_caller(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts_args["metadata_last_modified"] = "2024-06-03T20:07:31+00:00"
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.msg_metadata.last_modified == "2024-06-03T20:07:31+00:00"
|
||||
|
||||
def and_it_uses_the_sent_date_of_the_email_when_metadata_last_modified_is_not_provided(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.msg_metadata.last_modified == "2023-03-28T17:00:31+00:00"
|
||||
|
||||
@pytest.mark.parametrize("file_last_modified", ["2024-06-03T20:12:53", None])
|
||||
def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date(
|
||||
self,
|
||||
opts_args: dict[str, Any],
|
||||
file_last_modified: str | None,
|
||||
Message_sent_date_: Mock,
|
||||
_last_modified_prop_: Mock,
|
||||
):
|
||||
Message_sent_date_.return_value = None
|
||||
_last_modified_prop_.return_value = file_last_modified
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.msg_metadata.last_modified == file_last_modified
|
||||
|
||||
# -- .metadata (email-specific) --------------
|
||||
|
||||
def it_adds_email_specific_fields_to_the_msg_element_metadata(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = example_doc_path("fake-email.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.msg_metadata.sent_from == ['"Matthew Robinson" <mrobinson@unstructured.io>']
|
||||
assert opts.msg_metadata.sent_to == ["mrobinson@unstructured.io"]
|
||||
assert opts.msg_metadata.subject == "Test Email"
|
||||
|
||||
def it_captures_cc_and_bcc_element_metadata(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.msg_metadata.cc_recipient == ["steve@unstructured.io"]
|
||||
assert opts.msg_metadata.bcc_recipient == ["hello@unstructured.io"]
|
||||
assert opts.msg_metadata.sent_to == [
|
||||
"john-ctr@unstructured.io",
|
||||
"steve@unstructured.io",
|
||||
"hello@unstructured.io",
|
||||
]
|
||||
|
||||
def it_captures_email_message_id_element_metadata(self, opts_args: dict[str, Any]):
|
||||
opts_args["file_path"] = example_doc_path("fake-email-with-cc-and-bcc.msg")
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert (
|
||||
opts.msg_metadata.email_message_id == "14DDEF33-2BA7-4CDD-A4D8-E7C5873B37F2@gmail.com"
|
||||
)
|
||||
|
||||
# -- .partition_attachments ------------------
|
||||
|
||||
@pytest.mark.parametrize("partition_attachments", [True, False])
|
||||
@ -400,6 +404,16 @@ class DescribeMsgPartitionerOptions:
|
||||
|
||||
assert opts.partition_attachments is partition_attachments
|
||||
|
||||
# -- .partitioning_kwargs --------------------
|
||||
|
||||
def it_provides_access_to_pass_through_kwargs_collected_by_the_partitioner_function(
|
||||
self, opts_args: dict[str, Any]
|
||||
):
|
||||
opts_args["kwargs"] = {"foo": 42, "bar": "baz"}
|
||||
opts = MsgPartitionerOptions(**opts_args)
|
||||
|
||||
assert opts.partitioning_kwargs == {"foo": 42, "bar": "baz"}
|
||||
|
||||
# -- fixtures --------------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture
|
||||
@ -410,10 +424,6 @@ class DescribeMsgPartitionerOptions:
|
||||
def Message_sent_date_(self, request: FixtureRequest):
|
||||
return property_mock(request, Message, "sent_date")
|
||||
|
||||
@pytest.fixture
|
||||
def metadata_file_path_prop_(self, request: FixtureRequest):
|
||||
return property_mock(request, MsgPartitionerOptions, "metadata_file_path")
|
||||
|
||||
@pytest.fixture
|
||||
def opts_args(self) -> dict[str, Any]:
|
||||
"""All default arguments for `MsgPartitionerOptions`.
|
||||
@ -427,4 +437,5 @@ class DescribeMsgPartitionerOptions:
|
||||
"metadata_file_path": None,
|
||||
"metadata_last_modified": None,
|
||||
"partition_attachments": False,
|
||||
"kwargs": {},
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"element_id": "df08d0aeb11a34e75766d2d2008d73a6",
|
||||
"element_id": "e482ff3e97d6318a4c0e00aea0adf544",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-07-15T15:36:08",
|
||||
|
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"element_id": "e40af23706b4096145f1e4b007719aa5",
|
||||
"element_id": "4a69e8fcddd4b6eff8488a34ba16b0dd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-07-25T01:26:22",
|
||||
|
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"element_id": "8488a63070421b09a14ad6078c2cec2a",
|
||||
"element_id": "4df3eedf1b6f98566fc40a132b48205f",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"date_created": "2023-07-10T03:39:04",
|
||||
|
@ -1,9 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4196fe41da19e8657761ecffcafd3d2f",
|
||||
"element_id": "191e99ff4061730e85d9300183b4ccbe",
|
||||
"text": "Jane. This is a test of sending you an email from Salesforce! _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/",
|
||||
"metadata": {
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"email_message_id": "KhIK4000000000000000000000000000000000000000000000RZP1T400CmuP1P5wTm2m679gi-mnIg@sfdc.net",
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
@ -12,10 +16,6 @@
|
||||
"jane_gray@uoa.edu"
|
||||
],
|
||||
"subject": "Test of email 1",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErPIAU",
|
||||
"version": "1694691603.0",
|
||||
|
@ -1,9 +1,13 @@
|
||||
[
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6f168cd430b41fc0d66a3691ef3caa0f",
|
||||
"element_id": "f7d72e773a4c72747c88d8ea6e5d012a",
|
||||
"text": "Hey Sean. Testing email parsing here. Type: email Just testing the email system _____________________________________________________________________ Powered by Salesforce http://www.salesforce.com/",
|
||||
"metadata": {
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"email_message_id": "CuWky000000000000000000000000000000000000000000000RZP1VO00MaLK8OmEQm2Bw-c3ek6uNg@sfdc.net",
|
||||
"sent_from": [
|
||||
"devops+salesforce-connector@unstructured.io"
|
||||
@ -12,10 +16,6 @@
|
||||
"sean@edge.com"
|
||||
],
|
||||
"subject": "Test of Salesforce 2",
|
||||
"languages": [
|
||||
"eng"
|
||||
],
|
||||
"filetype": "message/rfc822",
|
||||
"data_source": {
|
||||
"url": "/services/data/v57.0/sobjects/EmailMessage/02sHu00001efErQIAU",
|
||||
"version": "1694691603.0",
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.14-dev10" # pragma: no cover
|
||||
__version__ = "0.15.14-dev11" # pragma: no cover
|
||||
|
@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import datetime
|
||||
import email
|
||||
import os
|
||||
@ -12,7 +11,6 @@ from functools import partial
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import IO, Any, Callable, Final, Type, cast
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings
|
||||
from unstructured.cleaners.extract import (
|
||||
extract_datetimetz,
|
||||
@ -27,7 +25,6 @@ from unstructured.documents.elements import (
|
||||
NarrativeText,
|
||||
Text,
|
||||
Title,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.documents.email_elements import (
|
||||
MetaData,
|
||||
@ -42,12 +39,10 @@ from unstructured.file_utils.encoding import (
|
||||
read_txt_file,
|
||||
validate_encoding,
|
||||
)
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
|
||||
from unstructured.partition.common.common import convert_to_bytes, exactly_one
|
||||
from unstructured.partition.common.lang import apply_lang_metadata
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.text import partition_text
|
||||
@ -56,38 +51,36 @@ VALID_CONTENT_SOURCES: Final[list[str]] = ["text/html", "text/plain"]
|
||||
DETECTION_ORIGIN: str = "email"
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.EML)
|
||||
@add_chunking_strategy
|
||||
def partition_email(
|
||||
filename: str | None = None,
|
||||
*,
|
||||
file: IO[bytes] | None = None,
|
||||
encoding: str | None = None,
|
||||
text: str | None = None,
|
||||
content_source: str = "text/html",
|
||||
encoding: str | None = None,
|
||||
include_headers: bool = False,
|
||||
metadata_filename: str | None = None,
|
||||
metadata_last_modified: str | None = None,
|
||||
process_attachments: bool = False,
|
||||
attachment_partitioner: Callable[..., list[Element]] | None = None,
|
||||
languages: list[str] | None = ["auto"],
|
||||
detect_language_per_element: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions an .eml document into its constituent elements.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename
|
||||
A string defining the target filename path.
|
||||
file
|
||||
A file-like object using "r" mode --> open(filename, "r").
|
||||
encoding
|
||||
The encoding method used to decode the input bytes when drawn from `filename` or `file`.
|
||||
Defaults to "utf-8".
|
||||
text
|
||||
The string representation of the .eml document.
|
||||
content_source
|
||||
default: "text/html"
|
||||
other: "text/plain"
|
||||
encoding
|
||||
The encoding method used to decode the text input. If None, utf-8 will be used.
|
||||
metadata_filename
|
||||
The filename to use for the metadata.
|
||||
metadata_last_modified
|
||||
@ -97,13 +90,6 @@ def partition_email(
|
||||
processing the content of the email itself.
|
||||
attachment_partitioner
|
||||
The partitioning function to use to process attachments.
|
||||
languages
|
||||
User defined value for `metadata.languages` if provided. Otherwise language is detected
|
||||
using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
|
||||
in either language.
|
||||
Additional Parameters:
|
||||
detect_language_per_element
|
||||
Detect language per element instead of at the document level.
|
||||
"""
|
||||
if content_source not in VALID_CONTENT_SOURCES:
|
||||
raise ValueError(
|
||||
@ -211,8 +197,9 @@ def partition_email(
|
||||
elements = partition_html(
|
||||
text=content,
|
||||
metadata_filename=metadata_filename,
|
||||
languages=[""],
|
||||
metadata_file_type=FileType.EML,
|
||||
detection_origin="email",
|
||||
**kwargs,
|
||||
)
|
||||
for element in elements:
|
||||
if isinstance(element, Text):
|
||||
@ -244,8 +231,9 @@ def partition_email(
|
||||
elements = partition_text(
|
||||
text=content,
|
||||
encoding=encoding,
|
||||
languages=[""],
|
||||
metadata_file_type=FileType.EML,
|
||||
detection_origin="email",
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
@ -274,7 +262,7 @@ def partition_email(
|
||||
last_modification_date=last_modified,
|
||||
)
|
||||
for element in all_elements:
|
||||
element.metadata = copy.deepcopy(metadata)
|
||||
element.metadata.update(metadata)
|
||||
|
||||
if process_attachments:
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
@ -295,15 +283,7 @@ def partition_email(
|
||||
element.metadata.attached_to_filename = metadata_filename or filename
|
||||
all_elements.append(element)
|
||||
|
||||
elements = list(
|
||||
apply_lang_metadata(
|
||||
elements=all_elements,
|
||||
languages=languages,
|
||||
detect_language_per_element=detect_language_per_element,
|
||||
),
|
||||
)
|
||||
|
||||
return elements
|
||||
return all_elements
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
|
@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
@ -9,21 +8,15 @@ from typing import IO, Any, Iterator, Optional
|
||||
from oxmsg import Message
|
||||
from oxmsg.attachment import Attachment
|
||||
|
||||
from unstructured.chunking import add_chunking_strategy
|
||||
from unstructured.documents.elements import Element, ElementMetadata, process_metadata
|
||||
from unstructured.file_utils.filetype import add_metadata_with_filetype
|
||||
from unstructured.documents.elements import Element, ElementMetadata
|
||||
from unstructured.file_utils.model import FileType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common.lang import apply_lang_metadata
|
||||
from unstructured.partition.common.metadata import get_last_modified_date
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.text import partition_text
|
||||
from unstructured.utils import is_temp_file_path, lazyproperty
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.MSG)
|
||||
@add_chunking_strategy
|
||||
def partition_msg(
|
||||
filename: Optional[str] = None,
|
||||
*,
|
||||
@ -55,15 +48,10 @@ def partition_msg(
|
||||
metadata_file_path=metadata_filename,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
partition_attachments=process_attachments,
|
||||
kwargs=kwargs,
|
||||
)
|
||||
|
||||
return list(
|
||||
apply_lang_metadata(
|
||||
elements=_MsgPartitioner.iter_message_elements(opts),
|
||||
languages=kwargs.get("languages", ["auto"]),
|
||||
detect_language_per_element=kwargs.get("detect_language_per_element", False),
|
||||
)
|
||||
)
|
||||
return list(_MsgPartitioner.iter_message_elements(opts))
|
||||
|
||||
|
||||
class MsgPartitionerOptions:
|
||||
@ -77,12 +65,48 @@ class MsgPartitionerOptions:
|
||||
metadata_file_path: str | None,
|
||||
metadata_last_modified: str | None,
|
||||
partition_attachments: bool,
|
||||
kwargs: dict[str, Any],
|
||||
):
|
||||
self._file = file
|
||||
self._file_path = file_path
|
||||
self._metadata_file_path = metadata_file_path
|
||||
self._metadata_last_modified = metadata_last_modified
|
||||
self._partition_attachments = partition_attachments
|
||||
self._kwargs = kwargs
|
||||
|
||||
@lazyproperty
|
||||
def extra_msg_metadata(self) -> ElementMetadata:
|
||||
"""ElementMetadata suitable for use on an element formed from message content.
|
||||
|
||||
These are only the metadata fields specific to email messages. The remaining metadata
|
||||
fields produced by the delegate partitioner are used as produced.
|
||||
|
||||
None of these metadata fields change based on the element, so we just compute it once.
|
||||
"""
|
||||
msg = self.msg
|
||||
|
||||
sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
|
||||
sent_to = [r.email_address for r in msg.recipients] or None
|
||||
bcc_recipient = (
|
||||
[c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None
|
||||
)
|
||||
cc_recipient = (
|
||||
[c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None
|
||||
)
|
||||
if email_message_id := msg.message_headers.get("Message-Id"):
|
||||
email_message_id = re.sub(r"^<|>$", "", email_message_id) # Strip angle brackets
|
||||
|
||||
element_metadata = ElementMetadata(
|
||||
bcc_recipient=bcc_recipient,
|
||||
cc_recipient=cc_recipient,
|
||||
email_message_id=email_message_id,
|
||||
sent_from=sent_from,
|
||||
sent_to=sent_to,
|
||||
subject=msg.subject or None,
|
||||
)
|
||||
element_metadata.detection_origin = "msg"
|
||||
|
||||
return element_metadata
|
||||
|
||||
@lazyproperty
|
||||
def is_encrypted(self) -> bool:
|
||||
@ -108,22 +132,14 @@ class MsgPartitionerOptions:
|
||||
@lazyproperty
|
||||
def metadata_last_modified(self) -> str | None:
|
||||
"""Caller override for `.metadata.last_modified` to be applied to all elements."""
|
||||
return self._metadata_last_modified
|
||||
email_date = sent_date.isoformat() if (sent_date := self.msg.sent_date) else None
|
||||
return self._metadata_last_modified or email_date or self._last_modified
|
||||
|
||||
@lazyproperty
|
||||
def msg(self) -> Message:
|
||||
"""The `oxmsg.Message` object loaded from file or filename."""
|
||||
return Message.load(self._msg_file)
|
||||
|
||||
@property
|
||||
def msg_metadata(self) -> ElementMetadata:
|
||||
"""ElementMetadata suitable for use on an element formed from message content.
|
||||
|
||||
A distinct instance is returned on each reference such that downstream changes to the
|
||||
metadata of one element is not also reflected in another element.
|
||||
"""
|
||||
return copy.copy(self._msg_metadata)
|
||||
|
||||
@lazyproperty
|
||||
def partition_attachments(self) -> bool:
|
||||
"""True when message attachments should also be partitioned."""
|
||||
@ -131,22 +147,20 @@ class MsgPartitionerOptions:
|
||||
|
||||
@lazyproperty
|
||||
def partitioning_kwargs(self) -> dict[str, Any]:
|
||||
"""Partitioning keyword-arguments to be passed along to attachment partitioner."""
|
||||
# TODO: no good reason we can't accept and pass along any file-type specific kwargs
|
||||
# the caller might want to send along.
|
||||
return {}
|
||||
"""The "extra" keyword arguments received by `partition_msg()`.
|
||||
|
||||
These are passed along to delegate partitioners which extract keyword args like
|
||||
`chunking_strategy` etc. in their decorators to control metadata behaviors, etc.
|
||||
"""
|
||||
return self._kwargs
|
||||
|
||||
@lazyproperty
|
||||
def _last_modified(self) -> str | None:
|
||||
"""The best last-modified date available from source-file, None if not available."""
|
||||
if self._file_path:
|
||||
return (
|
||||
None
|
||||
if is_temp_file_path(self._file_path)
|
||||
else get_last_modified_date(self._file_path)
|
||||
)
|
||||
if not self._file_path or is_temp_file_path(self._file_path):
|
||||
return None
|
||||
|
||||
return None
|
||||
return get_last_modified_date(self._file_path)
|
||||
|
||||
@lazyproperty
|
||||
def _msg_file(self) -> str | IO[bytes]:
|
||||
@ -159,41 +173,6 @@ class MsgPartitionerOptions:
|
||||
|
||||
raise ValueError("one of `file` or `filename` arguments must be provided")
|
||||
|
||||
@property
|
||||
def _msg_metadata(self) -> ElementMetadata:
|
||||
"""ElementMetadata "template" for elements of this message.
|
||||
|
||||
None of these metadata fields change based on the element, so compute it once here and then
|
||||
just make a separate copy for each element.
|
||||
"""
|
||||
msg = self.msg
|
||||
|
||||
email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None
|
||||
sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None
|
||||
sent_to = [r.email_address for r in msg.recipients] or None
|
||||
bcc_recipient = (
|
||||
[c.strip() for c in bcc.split(",")] if (bcc := msg.message_headers.get("Bcc")) else None
|
||||
)
|
||||
cc_recipient = (
|
||||
[c.strip() for c in cc.split(",")] if (cc := msg.message_headers.get("Cc")) else None
|
||||
)
|
||||
if email_message_id := msg.message_headers.get("Message-Id"):
|
||||
email_message_id = re.sub(r"^<|>$", "", email_message_id) # Strip angle brackets
|
||||
|
||||
element_metadata = ElementMetadata(
|
||||
filename=self.metadata_file_path,
|
||||
last_modified=self._metadata_last_modified or email_date or self._last_modified,
|
||||
sent_from=sent_from,
|
||||
sent_to=sent_to,
|
||||
subject=msg.subject or None,
|
||||
bcc_recipient=bcc_recipient,
|
||||
cc_recipient=cc_recipient,
|
||||
email_message_id=email_message_id,
|
||||
)
|
||||
element_metadata.detection_origin = "msg"
|
||||
|
||||
return element_metadata
|
||||
|
||||
|
||||
class _MsgPartitioner:
|
||||
"""Partitions Outlook email message (MSG) files."""
|
||||
@ -230,15 +209,28 @@ class _MsgPartitioner:
|
||||
msg = self._opts.msg
|
||||
|
||||
if html_body := msg.html_body:
|
||||
elements = partition_html(text=html_body, languages=[""])
|
||||
elements = partition_html(
|
||||
text=html_body,
|
||||
metadata_filename=self._opts.metadata_file_path,
|
||||
metadata_file_type=FileType.MSG,
|
||||
metadata_last_modified=self._opts.metadata_last_modified,
|
||||
**self._opts.partitioning_kwargs,
|
||||
)
|
||||
elif msg.body:
|
||||
elements = partition_text(text=msg.body, languages=[""])
|
||||
elements = partition_text(
|
||||
text=msg.body,
|
||||
metadata_filename=self._opts.metadata_file_path,
|
||||
metadata_file_type=FileType.MSG,
|
||||
metadata_last_modified=self._opts.metadata_last_modified,
|
||||
**self._opts.partitioning_kwargs,
|
||||
)
|
||||
else:
|
||||
elements: list[Element] = []
|
||||
|
||||
# -- replace the element metadata with email-specific values --
|
||||
# -- augment the element metadata with email-specific values --
|
||||
email_specific_metadata = self._opts.extra_msg_metadata
|
||||
for e in elements:
|
||||
e.metadata = self._opts.msg_metadata
|
||||
e.metadata.update(email_specific_metadata)
|
||||
yield e
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user