mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: attachment processing for emails (#855)
* process attachments for email * add attachment processing to msg * fix up metadata for attachments * add test for processing email attachments * added test for processing msg attachments * update docs * tests for error conditions * version and changelog
This commit is contained in:
parent
92e55eb89e
commit
c581a33c8a
@ -2,6 +2,9 @@
|
||||
|
||||
### Enhancements
|
||||
|
||||
* `partition_email` and `partition_msg` will now process attachments if `process_attachments=True`
|
||||
and an attachment partitioning function is passed through with `attachment_partitioner=partition`.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
@ -98,8 +98,8 @@ about the library.
|
||||
| Document Type | Partition Function | Strategies | Table Support | Options |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| CSV Files (`.csv`) | `partition_csv` | N/A | Yes | None |
|
||||
| E-mails (`.eml`) | `partition_eml` | N/A | No | Encoding; Max Partition |
|
||||
| E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition |
|
||||
| E-mails (`.eml`) | `partition_email` | N/A | No | Encoding; Max Partition; Process Attachments |
|
||||
| E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition; Process Attachments |
|
||||
| EPubs (`.epub`) | `partition_epub` | N/A | Yes | Include Page Breaks |
|
||||
| Excel Documents (`.xlsx`/`.xls`) | `partition_xlsx` | N/A | Yes | None |
|
||||
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
|
||||
|
@ -273,6 +273,22 @@ the average character length for a paragraph.
|
||||
You can disable ``max_partition`` by setting it to ``None``.
|
||||
|
||||
|
||||
You can optionally partition e-mail attachments by setting ``process_attachments=True``.
|
||||
If you set ``process_attachments=True``, you'll also need to pass in a partitioning
|
||||
function to ``attachment_partitioner``. The following is an example of what the
|
||||
workflow looks like:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.auto import partition
|
||||
from unstructured.partition.email import partition_email
|
||||
|
||||
filename = "example-docs/eml/fake-email-attachment.eml"
|
||||
elements = partition_email(
|
||||
filename=filename, process_attachments=True, attachment_partitioner=partition
|
||||
)
|
||||
|
||||
|
||||
``partition_epub``
|
||||
---------------------
|
||||
|
||||
@ -439,6 +455,22 @@ the average character length for a paragraph.
|
||||
You can disable ``max_partition`` by setting it to ``None``.
|
||||
|
||||
|
||||
You can optionally partition e-mail attachments by setting ``process_attachments=True``.
|
||||
If you set ``process_attachments=True``, you'll also need to pass in a partitioning
|
||||
function to ``attachment_partitioner``. The following is an example of what the
|
||||
workflow looks like:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.auto import partition
|
||||
from unstructured.partition.msg import partition_msg
|
||||
|
||||
filename = "example-docs/fake-email-attachment.msg"
|
||||
elements = partition_msg(
|
||||
filename=filename, process_attachments=True, attachment_partitioner=partition
|
||||
)
|
||||
|
||||
|
||||
``partition_multiple_via_api``
|
||||
------------------------------
|
||||
|
||||
|
@ -25,6 +25,7 @@ from unstructured.partition.email import (
|
||||
partition_email,
|
||||
partition_email_header,
|
||||
)
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml")
|
||||
@ -326,3 +327,42 @@ def test_partition_email_still_works_with_no_content():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
|
||||
elements = partition_email(filename=filename)
|
||||
assert elements == []
|
||||
|
||||
|
||||
def test_partition_email_can_process_attachments(
|
||||
tmpdir,
|
||||
filename="example-docs/eml/fake-email-attachment.eml",
|
||||
):
|
||||
with open(filename) as f:
|
||||
msg = email.message_from_file(f)
|
||||
extract_attachment_info(msg, output_dir=tmpdir.dirname)
|
||||
attachment_filename = os.path.join(tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"])
|
||||
attachment_elements = partition_text(
|
||||
filename=attachment_filename,
|
||||
metadata_filename=attachment_filename,
|
||||
)
|
||||
expected_metadata = attachment_elements[0].metadata
|
||||
expected_metadata.file_directory = None
|
||||
expected_metadata.attached_to_filename = filename
|
||||
|
||||
elements = partition_email(
|
||||
filename=filename,
|
||||
attachment_partitioner=partition_text,
|
||||
process_attachments=True,
|
||||
)
|
||||
|
||||
assert elements[0].text.startswith("Hello!")
|
||||
|
||||
for element in elements[:-1]:
|
||||
assert element.metadata.filename == "fake-email-attachment.eml"
|
||||
assert element.metadata.subject == "Fake email with attachment"
|
||||
|
||||
assert elements[-1].text == "Hey this is a fake attachment!"
|
||||
assert elements[-1].metadata == expected_metadata
|
||||
|
||||
|
||||
def test_partition_msg_raises_with_no_partitioner(
|
||||
filename="example-docs/eml/fake-email-attachment.eml",
|
||||
):
|
||||
with pytest.raises(ValueError):
|
||||
partition_email(filename=filename, process_attachments=True)
|
||||
|
@ -11,6 +11,7 @@ from unstructured.documents.elements import (
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
@ -94,3 +95,40 @@ def test_partition_msg_raises_with_both_specified():
|
||||
def test_partition_msg_raises_with_neither():
|
||||
with pytest.raises(ValueError):
|
||||
partition_msg()
|
||||
|
||||
|
||||
def test_partition_msg_can_process_attachments(
|
||||
tmpdir,
|
||||
filename="example-docs/fake-email-attachment.msg",
|
||||
):
|
||||
extract_msg_attachment_info(filename=filename, output_dir=tmpdir.dirname)
|
||||
attachment_filename = os.path.join(tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"])
|
||||
attachment_elements = partition_text(
|
||||
filename=attachment_filename,
|
||||
metadata_filename=attachment_filename,
|
||||
)
|
||||
expected_metadata = attachment_elements[0].metadata
|
||||
expected_metadata.file_directory = None
|
||||
expected_metadata.attached_to_filename = filename
|
||||
|
||||
elements = partition_msg(
|
||||
filename=filename,
|
||||
attachment_partitioner=partition_text,
|
||||
process_attachments=True,
|
||||
)
|
||||
|
||||
assert elements[0].text.startswith("Hello!")
|
||||
|
||||
for element in elements[:-1]:
|
||||
assert element.metadata.filename == "fake-email-attachment.msg"
|
||||
assert element.metadata.subject == "Fake email with attachment"
|
||||
|
||||
assert elements[-1].text == "Hey this is a fake attachment!"
|
||||
assert elements[-1].metadata == expected_metadata
|
||||
|
||||
|
||||
def test_partition_msg_raises_with_no_partitioner(
|
||||
filename="example-docs/fake-email-attachment.msg",
|
||||
):
|
||||
with pytest.raises(ValueError):
|
||||
partition_msg(filename=filename, process_attachments=True)
|
||||
|
@ -50,6 +50,7 @@ class ElementMetadata:
|
||||
file_directory: Optional[str] = None
|
||||
date: Optional[str] = None
|
||||
filetype: Optional[str] = None
|
||||
attached_to_filename: Optional[str] = None
|
||||
|
||||
# Page numbers currently supported for PDF, HTML and PPT documents
|
||||
page_number: Optional[int] = None
|
||||
|
@ -516,11 +516,14 @@ def add_metadata_with_filetype(filetype: FileType):
|
||||
kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
|
||||
}
|
||||
for element in elements:
|
||||
_add_element_metadata(
|
||||
element,
|
||||
filetype=FILETYPE_TO_MIMETYPE[filetype],
|
||||
**metadata_kwargs, # type: ignore
|
||||
)
|
||||
# NOTE(robinson) - Attached files have already run through this logic
|
||||
# in their own partitioning function
|
||||
if element.metadata.attached_to_filename is None:
|
||||
_add_element_metadata(
|
||||
element,
|
||||
filetype=FILETYPE_TO_MIMETYPE[filetype],
|
||||
**metadata_kwargs, # type: ignore
|
||||
)
|
||||
|
||||
return elements
|
||||
else:
|
||||
|
@ -1,11 +1,12 @@
|
||||
import datetime
|
||||
import email
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from email.message import Message
|
||||
from functools import partial
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from typing import IO, Dict, List, Optional, Tuple, Union
|
||||
from tempfile import SpooledTemporaryFile, TemporaryDirectory
|
||||
from typing import IO, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from unstructured.file_utils.encoding import (
|
||||
COMMON_ENCODINGS,
|
||||
@ -226,6 +227,9 @@ def partition_email(
|
||||
encoding: Optional[str] = None,
|
||||
include_headers: bool = False,
|
||||
max_partition: Optional[int] = 1500,
|
||||
metadata_filename: Optional[str] = None,
|
||||
process_attachments: bool = False,
|
||||
attachment_partitioner: Optional[Callable] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions an .eml document into its constituent elements.
|
||||
@ -245,6 +249,13 @@ def partition_email(
|
||||
max_partition
|
||||
The maximum number of characters to include in a partition. If None is passed,
|
||||
no maximum is applied. Only applies if processing the text/plain content.
|
||||
metadata_filename
|
||||
The filename to use for the metadata.
|
||||
process_attachments
|
||||
If True, partition_email will process email attachments in addition to
|
||||
processing the content of the email itself.
|
||||
attachment_partitioner
|
||||
The partitioning function to use to process attachments.
|
||||
"""
|
||||
if content_source not in VALID_CONTENT_SOURCES:
|
||||
raise ValueError(
|
||||
@ -258,6 +269,8 @@ def partition_email(
|
||||
# Verify that only one of the arguments was provided
|
||||
exactly_one(filename=filename, file=file, text=text)
|
||||
|
||||
metadata_filename = metadata_filename or filename
|
||||
|
||||
detected_encoding = "utf-8"
|
||||
if filename is not None:
|
||||
extracted_encoding, msg = parse_email(filename=filename)
|
||||
@ -341,7 +354,25 @@ def partition_email(
|
||||
header = partition_email_header(msg)
|
||||
all_elements = header + elements
|
||||
|
||||
metadata = build_email_metadata(msg, filename=filename)
|
||||
metadata = build_email_metadata(msg, filename=metadata_filename)
|
||||
for element in all_elements:
|
||||
element.metadata = metadata
|
||||
|
||||
if process_attachments:
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
extract_attachment_info(msg, tmpdir)
|
||||
attached_files = os.listdir(tmpdir)
|
||||
for attached_file in attached_files:
|
||||
attached_filename = os.path.join(tmpdir, attached_file)
|
||||
if attachment_partitioner is None:
|
||||
raise ValueError(
|
||||
"Specify the attachment_partitioner kwarg to process attachments.",
|
||||
)
|
||||
attached_elements = attachment_partitioner(filename=attached_filename)
|
||||
for element in attached_elements:
|
||||
element.metadata.filename = attached_file
|
||||
element.metadata.file_directory = None
|
||||
element.metadata.attached_to_filename = metadata_filename
|
||||
all_elements.append(element)
|
||||
|
||||
return all_elements
|
||||
|
@ -1,5 +1,6 @@
|
||||
import os
|
||||
import tempfile
|
||||
from typing import IO, Dict, List, Optional
|
||||
from typing import IO, Callable, Dict, List, Optional
|
||||
|
||||
import msg_parser
|
||||
|
||||
@ -17,6 +18,9 @@ def partition_msg(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
max_partition: Optional[int] = 1500,
|
||||
metadata_filename: Optional[str] = None,
|
||||
process_attachments: bool = False,
|
||||
attachment_partitioner: Optional[Callable] = None,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Partitions a MSFT Outlook .msg file
|
||||
@ -30,6 +34,13 @@ def partition_msg(
|
||||
max_partition
|
||||
The maximum number of characters to include in a partition. If None is passed,
|
||||
no maximum is applied. Only applies if processing text/plain content.
|
||||
metadata_filename
|
||||
The filename to use for the metadata.
|
||||
process_attachments
|
||||
If True, partition_msg will process email attachments in addition to
|
||||
processing the content of the email itself.
|
||||
attachment_partitioner
|
||||
The partitioning function to use to process attachments.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -41,16 +52,35 @@ def partition_msg(
|
||||
tmp.close()
|
||||
msg_obj = msg_parser.MsOxMessage(tmp.name)
|
||||
|
||||
metadata_filename = metadata_filename or filename
|
||||
|
||||
text = msg_obj.body
|
||||
if "<html>" in text or "</div>" in text:
|
||||
elements = partition_html(text=text)
|
||||
else:
|
||||
elements = partition_text(text=text, max_partition=max_partition)
|
||||
|
||||
metadata = build_msg_metadata(msg_obj, filename)
|
||||
metadata = build_msg_metadata(msg_obj, metadata_filename)
|
||||
for element in elements:
|
||||
element.metadata = metadata
|
||||
|
||||
if process_attachments:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
extract_msg_attachment_info(msg_obj=msg_obj, output_dir=tmpdir)
|
||||
attached_files = os.listdir(tmpdir)
|
||||
for attached_file in attached_files:
|
||||
attached_filename = os.path.join(tmpdir, attached_file)
|
||||
if attachment_partitioner is None:
|
||||
raise ValueError(
|
||||
"Specify the attachment_partitioner kwarg to process attachments.",
|
||||
)
|
||||
attached_elements = attachment_partitioner(filename=attached_filename)
|
||||
for element in attached_elements:
|
||||
element.metadata.filename = attached_file
|
||||
element.metadata.file_directory = None
|
||||
element.metadata.attached_to_filename = metadata_filename
|
||||
elements.append(element)
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
@ -78,11 +108,12 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str])
|
||||
|
||||
|
||||
def extract_msg_attachment_info(
|
||||
filename: str,
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
msg_obj: Optional[msg_parser.MsOxMessage] = None,
|
||||
) -> List[Dict[str, str]]:
|
||||
exactly_one(filename=filename, file=file)
|
||||
exactly_one(filename=filename, file=file, msg_obj=msg_obj)
|
||||
|
||||
if filename is not None:
|
||||
msg_obj = msg_parser.MsOxMessage(filename)
|
||||
@ -91,6 +122,8 @@ def extract_msg_attachment_info(
|
||||
tmp.write(file.read())
|
||||
tmp.close()
|
||||
msg_obj = msg_parser.MsOxMessage(tmp.name)
|
||||
elif msg_obj is not None:
|
||||
msg_obj = msg_obj
|
||||
|
||||
list_attachments = []
|
||||
|
||||
@ -105,8 +138,8 @@ def extract_msg_attachment_info(
|
||||
list_attachments.append(attachment_info)
|
||||
|
||||
if output_dir is not None:
|
||||
filename = output_dir + "/" + attachment_info["filename"]
|
||||
with open(filename, "wb") as f:
|
||||
output_filename = output_dir + "/" + attachment_info["filename"]
|
||||
with open(output_filename, "wb") as f:
|
||||
f.write(attachment.data)
|
||||
|
||||
return list_attachments
|
||||
|
Loading…
x
Reference in New Issue
Block a user