feat: attachment processing for emails (#855)

* process attachments for email

* add attachment processing to msg

* fix up metadata for attachments

* add test for processing email attachments

* added test for processing msg attachments

* update docs

* tests for error conditions

* version and changelog
This commit is contained in:
Matt Robinson 2023-06-29 18:01:12 -04:00 committed by GitHub
parent 92e55eb89e
commit c581a33c8a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 197 additions and 16 deletions

View File

@ -2,6 +2,9 @@
### Enhancements
* `partition_email` and `partition_msg` will now process attachments if `process_attachments=True`
and an attachment partitioning function is passed through with `attachment_partitioner=partition`.
### Features
### Fixes

View File

@ -98,8 +98,8 @@ about the library.
| Document Type | Partition Function | Strategies | Table Support | Options |
| --- | --- | --- | --- | --- |
| CSV Files (`.csv`) | `partition_csv` | N/A | Yes | None |
| E-mails (`.eml`) | `partition_eml` | N/A | No | Encoding; Max Partition |
| E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition |
| E-mails (`.eml`) | `partition_email` | N/A | No | Encoding; Max Partition; Process Attachments |
| E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition; Process Attachments |
| EPubs (`.epub`) | `partition_epub` | N/A | Yes | Include Page Breaks |
| Excel Documents (`.xlsx`/`.xls`) | `partition_xlsx` | N/A | Yes | None |
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |

View File

@ -273,6 +273,22 @@ the average character length for a paragraph.
You can disable ``max_partition`` by setting it to ``None``.
You can optionally partition e-mail attachments by setting ``process_attachments=True``.
If you set ``process_attachments=True``, you'll also need to pass in a partitioning
function to ``attachment_partitioner``. The following is an example of what the
workflow looks like:
.. code:: python
from unstructured.partition.auto import partition
from unstructured.partition.email import partition_email
filename = "example-docs/eml/fake-email-attachment.eml"
elements = partition_email(
filename=filename, process_attachments=True, attachment_partitioner=partition
)
``partition_epub``
---------------------
@ -439,6 +455,22 @@ the average character length for a paragraph.
You can disable ``max_partition`` by setting it to ``None``.
You can optionally partition e-mail attachments by setting ``process_attachments=True``.
If you set ``process_attachments=True``, you'll also need to pass in a partitioning
function to ``attachment_partitioner``. The following is an example of what the
workflow looks like:
.. code:: python
from unstructured.partition.auto import partition
from unstructured.partition.msg import partition_msg
filename = "example-docs/fake-email-attachment.msg"
elements = partition_msg(
filename=filename, process_attachments=True, attachment_partitioner=partition
)
``partition_multiple_via_api``
------------------------------

View File

@ -25,6 +25,7 @@ from unstructured.partition.email import (
partition_email,
partition_email_header,
)
from unstructured.partition.text import partition_text
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml")
@ -326,3 +327,42 @@ def test_partition_email_still_works_with_no_content():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
elements = partition_email(filename=filename)
assert elements == []
def test_partition_email_can_process_attachments(
    tmpdir,
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """Attachments are partitioned and appended after the email's own elements.

    The expected attachment metadata is built by extracting the attachment to
    disk and partitioning it directly with ``partition_text``, then applying the
    same overrides ``partition_email`` applies to attachment elements
    (``file_directory`` cleared, ``attached_to_filename`` set to the email).
    """
    output_dir = tmpdir.dirname

    # Build the reference metadata from a standalone partition of the attachment.
    with open(filename) as eml_file:
        message = email.message_from_file(eml_file)
    extract_attachment_info(message, output_dir=output_dir)
    attachment_path = os.path.join(output_dir, ATTACH_EXPECTED_OUTPUT[0]["filename"])
    reference_elements = partition_text(
        filename=attachment_path,
        metadata_filename=attachment_path,
    )
    expected_metadata = reference_elements[0].metadata
    expected_metadata.file_directory = None
    expected_metadata.attached_to_filename = filename

    elements = partition_email(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
    )

    assert elements[0].text.startswith("Hello!")

    # Everything except the last element comes from the email itself.
    body_elements = elements[:-1]
    for body_element in body_elements:
        assert body_element.metadata.filename == "fake-email-attachment.eml"
        assert body_element.metadata.subject == "Fake email with attachment"

    # The last element is the partitioned attachment.
    attachment_element = elements[-1]
    assert attachment_element.text == "Hey this is a fake attachment!"
    assert attachment_element.metadata == expected_metadata
def test_partition_msg_raises_with_no_partitioner(
    filename="example-docs/eml/fake-email-attachment.eml",
):
    """``partition_email`` raises when attachments are requested without a partitioner.

    NOTE(review): despite the ``msg`` in its name, this test exercises
    ``partition_email`` on an ``.eml`` fixture; the name is kept unchanged so
    existing test selections keep working.
    """
    # Match on the message so an unrelated ValueError (e.g. from argument
    # validation) cannot make this test pass for the wrong reason.
    with pytest.raises(ValueError, match="attachment_partitioner"):
        partition_email(filename=filename, process_attachments=True)

View File

@ -11,6 +11,7 @@ from unstructured.documents.elements import (
Title,
)
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
from unstructured.partition.text import partition_text
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
@ -94,3 +95,40 @@ def test_partition_msg_raises_with_both_specified():
def test_partition_msg_raises_with_neither():
with pytest.raises(ValueError):
partition_msg()
def test_partition_msg_can_process_attachments(
    tmpdir,
    filename="example-docs/fake-email-attachment.msg",
):
    """Attachments are partitioned and appended after the message's own elements.

    The expected attachment metadata is built by extracting the attachment to
    disk and partitioning it directly with ``partition_text``, then applying the
    same overrides ``partition_msg`` applies to attachment elements
    (``file_directory`` cleared, ``attached_to_filename`` set to the message).
    """
    output_dir = tmpdir.dirname

    # Build the reference metadata from a standalone partition of the attachment.
    extract_msg_attachment_info(filename=filename, output_dir=output_dir)
    attachment_path = os.path.join(output_dir, ATTACH_EXPECTED_OUTPUT[0]["filename"])
    reference_elements = partition_text(
        filename=attachment_path,
        metadata_filename=attachment_path,
    )
    expected_metadata = reference_elements[0].metadata
    expected_metadata.file_directory = None
    expected_metadata.attached_to_filename = filename

    elements = partition_msg(
        filename=filename,
        attachment_partitioner=partition_text,
        process_attachments=True,
    )

    assert elements[0].text.startswith("Hello!")

    # Everything except the last element comes from the message itself.
    body_elements = elements[:-1]
    for body_element in body_elements:
        assert body_element.metadata.filename == "fake-email-attachment.msg"
        assert body_element.metadata.subject == "Fake email with attachment"

    # The last element is the partitioned attachment.
    attachment_element = elements[-1]
    assert attachment_element.text == "Hey this is a fake attachment!"
    assert attachment_element.metadata == expected_metadata
def test_partition_msg_raises_with_no_partitioner(
    filename="example-docs/fake-email-attachment.msg",
):
    """``partition_msg`` raises when attachments are requested without a partitioner."""
    # Match on the message so an unrelated ValueError (e.g. from argument
    # validation) cannot make this test pass for the wrong reason.
    with pytest.raises(ValueError, match="attachment_partitioner"):
        partition_msg(filename=filename, process_attachments=True)

View File

@ -50,6 +50,7 @@ class ElementMetadata:
file_directory: Optional[str] = None
date: Optional[str] = None
filetype: Optional[str] = None
attached_to_filename: Optional[str] = None
# Page numbers currently supported for PDF, HTML and PPT documents
page_number: Optional[int] = None

View File

@ -516,11 +516,14 @@ def add_metadata_with_filetype(filetype: FileType):
kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
}
for element in elements:
_add_element_metadata(
element,
filetype=FILETYPE_TO_MIMETYPE[filetype],
**metadata_kwargs, # type: ignore
)
# NOTE(robinson) - Attached files have already run through this logic
# in their own partitioning function
if element.metadata.attached_to_filename is None:
_add_element_metadata(
element,
filetype=FILETYPE_TO_MIMETYPE[filetype],
**metadata_kwargs, # type: ignore
)
return elements
else:

View File

@ -1,11 +1,12 @@
import datetime
import email
import os
import re
import sys
from email.message import Message
from functools import partial
from tempfile import SpooledTemporaryFile
from typing import IO, Dict, List, Optional, Tuple, Union
from tempfile import SpooledTemporaryFile, TemporaryDirectory
from typing import IO, Callable, Dict, List, Optional, Tuple, Union
from unstructured.file_utils.encoding import (
COMMON_ENCODINGS,
@ -226,6 +227,9 @@ def partition_email(
encoding: Optional[str] = None,
include_headers: bool = False,
max_partition: Optional[int] = 1500,
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
**kwargs,
) -> List[Element]:
"""Partitions an .eml document into its constituent elements.
@ -245,6 +249,13 @@ def partition_email(
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied. Only applies if processing the text/plain content.
metadata_filename
The filename to use for the metadata.
process_attachments
If True, partition_email will process email attachments in addition to
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
"""
if content_source not in VALID_CONTENT_SOURCES:
raise ValueError(
@ -258,6 +269,8 @@ def partition_email(
# Verify that only one of the arguments was provided
exactly_one(filename=filename, file=file, text=text)
metadata_filename = metadata_filename or filename
detected_encoding = "utf-8"
if filename is not None:
extracted_encoding, msg = parse_email(filename=filename)
@ -341,7 +354,25 @@ def partition_email(
header = partition_email_header(msg)
all_elements = header + elements
metadata = build_email_metadata(msg, filename=filename)
metadata = build_email_metadata(msg, filename=metadata_filename)
for element in all_elements:
element.metadata = metadata
if process_attachments:
with TemporaryDirectory() as tmpdir:
extract_attachment_info(msg, tmpdir)
attached_files = os.listdir(tmpdir)
for attached_file in attached_files:
attached_filename = os.path.join(tmpdir, attached_file)
if attachment_partitioner is None:
raise ValueError(
"Specify the attachment_partitioner kwarg to process attachments.",
)
attached_elements = attachment_partitioner(filename=attached_filename)
for element in attached_elements:
element.metadata.filename = attached_file
element.metadata.file_directory = None
element.metadata.attached_to_filename = metadata_filename
all_elements.append(element)
return all_elements

View File

@ -1,5 +1,6 @@
import os
import tempfile
from typing import IO, Dict, List, Optional
from typing import IO, Callable, Dict, List, Optional
import msg_parser
@ -17,6 +18,9 @@ def partition_msg(
filename: Optional[str] = None,
file: Optional[IO] = None,
max_partition: Optional[int] = 1500,
metadata_filename: Optional[str] = None,
process_attachments: bool = False,
attachment_partitioner: Optional[Callable] = None,
**kwargs,
) -> List[Element]:
"""Partitions a MSFT Outlook .msg file
@ -30,6 +34,13 @@ def partition_msg(
max_partition
The maximum number of characters to include in a partition. If None is passed,
no maximum is applied. Only applies if processing text/plain content.
metadata_filename
The filename to use for the metadata.
process_attachments
If True, partition_msg will process email attachments in addition to
processing the content of the email itself.
attachment_partitioner
The partitioning function to use to process attachments.
"""
exactly_one(filename=filename, file=file)
@ -41,16 +52,35 @@ def partition_msg(
tmp.close()
msg_obj = msg_parser.MsOxMessage(tmp.name)
metadata_filename = metadata_filename or filename
text = msg_obj.body
if "<html>" in text or "</div>" in text:
elements = partition_html(text=text)
else:
elements = partition_text(text=text, max_partition=max_partition)
metadata = build_msg_metadata(msg_obj, filename)
metadata = build_msg_metadata(msg_obj, metadata_filename)
for element in elements:
element.metadata = metadata
if process_attachments:
with tempfile.TemporaryDirectory() as tmpdir:
extract_msg_attachment_info(msg_obj=msg_obj, output_dir=tmpdir)
attached_files = os.listdir(tmpdir)
for attached_file in attached_files:
attached_filename = os.path.join(tmpdir, attached_file)
if attachment_partitioner is None:
raise ValueError(
"Specify the attachment_partitioner kwarg to process attachments.",
)
attached_elements = attachment_partitioner(filename=attached_filename)
for element in attached_elements:
element.metadata.filename = attached_file
element.metadata.file_directory = None
element.metadata.attached_to_filename = metadata_filename
elements.append(element)
return elements
@ -78,11 +108,12 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str])
def extract_msg_attachment_info(
filename: str,
filename: Optional[str] = None,
file: Optional[IO] = None,
output_dir: Optional[str] = None,
msg_obj: Optional[msg_parser.MsOxMessage] = None,
) -> List[Dict[str, str]]:
exactly_one(filename=filename, file=file)
exactly_one(filename=filename, file=file, msg_obj=msg_obj)
if filename is not None:
msg_obj = msg_parser.MsOxMessage(filename)
@ -91,6 +122,8 @@ def extract_msg_attachment_info(
tmp.write(file.read())
tmp.close()
msg_obj = msg_parser.MsOxMessage(tmp.name)
elif msg_obj is not None:
msg_obj = msg_obj
list_attachments = []
@ -105,8 +138,8 @@ def extract_msg_attachment_info(
list_attachments.append(attachment_info)
if output_dir is not None:
filename = output_dir + "/" + attachment_info["filename"]
with open(filename, "wb") as f:
output_filename = output_dir + "/" + attachment_info["filename"]
with open(output_filename, "wb") as f:
f.write(attachment.data)
return list_attachments