mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 15:11:30 +00:00
feat: attachment processing for emails (#855)
* process attachments for email * add attachment processing to msg * fix up metadata for attachments * add test for processing email attachments * added test for processing msg attachments * update docs * tests for error conditions * version and changelog
This commit is contained in:
parent
92e55eb89e
commit
c581a33c8a
@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* `partition_email` and `partition_msg` will now process attachments if `process_attachments=True`
|
||||||
|
and an attachment partitioning function is passed through with `attachment_partitioner=partition`.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
@ -98,8 +98,8 @@ about the library.
|
|||||||
| Document Type | Partition Function | Strategies | Table Support | Options |
|
| Document Type | Partition Function | Strategies | Table Support | Options |
|
||||||
| --- | --- | --- | --- | --- |
|
| --- | --- | --- | --- | --- |
|
||||||
| CSV Files (`.csv`) | `partition_csv` | N/A | Yes | None |
|
| CSV Files (`.csv`) | `partition_csv` | N/A | Yes | None |
|
||||||
| E-mails (`.eml`) | `partition_eml` | N/A | No | Encoding; Max Partition |
|
| E-mails (`.eml`) | `partition_email` | N/A | No | Encoding; Max Partition; Process Attachments |
|
||||||
| E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition |
|
| E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition; Process Attachments |
|
||||||
| EPubs (`.epub`) | `partition_epub` | N/A | Yes | Include Page Breaks |
|
| EPubs (`.epub`) | `partition_epub` | N/A | Yes | Include Page Breaks |
|
||||||
| Excel Documents (`.xlsx`/`.xls`) | `partition_xlsx` | N/A | Yes | None |
|
| Excel Documents (`.xlsx`/`.xls`) | `partition_xlsx` | N/A | Yes | None |
|
||||||
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
|
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
|
||||||
|
@ -273,6 +273,22 @@ the average character length for a paragraph.
|
|||||||
You can disable ``max_partition`` by setting it to ``None``.
|
You can disable ``max_partition`` by setting it to ``None``.
|
||||||
|
|
||||||
|
|
||||||
|
You can optionally partition e-mail attachments by setting ``process_attachments=True``.
|
||||||
|
If you set ``process_attachments=True``, you'll also need to pass in a partitioning
|
||||||
|
function to ``attachment_partitioner``. The following is an example of what the
|
||||||
|
workflow looks like:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
|
from unstructured.partition.email import partition_email
|
||||||
|
|
||||||
|
filename = "example-docs/eml/fake-email-attachment.eml"
|
||||||
|
elements = partition_email(
|
||||||
|
filename=filename, process_attachments=True, attachment_partitioner=partition
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
``partition_epub``
|
``partition_epub``
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
@ -439,6 +455,22 @@ the average character length for a paragraph.
|
|||||||
You can disable ``max_partition`` by setting it to ``None``.
|
You can disable ``max_partition`` by setting it to ``None``.
|
||||||
|
|
||||||
|
|
||||||
|
You can optionally partition e-mail attachments by setting ``process_attachments=True``.
|
||||||
|
If you set ``process_attachments=True``, you'll also need to pass in a partitioning
|
||||||
|
function to ``attachment_partitioner``. The following is an example of what the
|
||||||
|
workflow looks like:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
|
from unstructured.partition.msg import partition_msg
|
||||||
|
|
||||||
|
filename = "example-docs/fake-email-attachment.msg"
|
||||||
|
elements = partition_msg(
|
||||||
|
filename=filename, process_attachments=True, attachment_partitioner=partition
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
``partition_multiple_via_api``
|
``partition_multiple_via_api``
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ from unstructured.partition.email import (
|
|||||||
partition_email,
|
partition_email,
|
||||||
partition_email_header,
|
partition_email_header,
|
||||||
)
|
)
|
||||||
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml")
|
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml")
|
||||||
@ -326,3 +327,42 @@ def test_partition_email_still_works_with_no_content():
|
|||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
|
||||||
elements = partition_email(filename=filename)
|
elements = partition_email(filename=filename)
|
||||||
assert elements == []
|
assert elements == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_can_process_attachments(
|
||||||
|
tmpdir,
|
||||||
|
filename="example-docs/eml/fake-email-attachment.eml",
|
||||||
|
):
|
||||||
|
with open(filename) as f:
|
||||||
|
msg = email.message_from_file(f)
|
||||||
|
extract_attachment_info(msg, output_dir=tmpdir.dirname)
|
||||||
|
attachment_filename = os.path.join(tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"])
|
||||||
|
attachment_elements = partition_text(
|
||||||
|
filename=attachment_filename,
|
||||||
|
metadata_filename=attachment_filename,
|
||||||
|
)
|
||||||
|
expected_metadata = attachment_elements[0].metadata
|
||||||
|
expected_metadata.file_directory = None
|
||||||
|
expected_metadata.attached_to_filename = filename
|
||||||
|
|
||||||
|
elements = partition_email(
|
||||||
|
filename=filename,
|
||||||
|
attachment_partitioner=partition_text,
|
||||||
|
process_attachments=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert elements[0].text.startswith("Hello!")
|
||||||
|
|
||||||
|
for element in elements[:-1]:
|
||||||
|
assert element.metadata.filename == "fake-email-attachment.eml"
|
||||||
|
assert element.metadata.subject == "Fake email with attachment"
|
||||||
|
|
||||||
|
assert elements[-1].text == "Hey this is a fake attachment!"
|
||||||
|
assert elements[-1].metadata == expected_metadata
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_msg_raises_with_no_partitioner(
|
||||||
|
filename="example-docs/eml/fake-email-attachment.eml",
|
||||||
|
):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
partition_email(filename=filename, process_attachments=True)
|
||||||
|
@ -11,6 +11,7 @@ from unstructured.documents.elements import (
|
|||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
|
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
|
||||||
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||||
@ -94,3 +95,40 @@ def test_partition_msg_raises_with_both_specified():
|
|||||||
def test_partition_msg_raises_with_neither():
|
def test_partition_msg_raises_with_neither():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
partition_msg()
|
partition_msg()
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_msg_can_process_attachments(
|
||||||
|
tmpdir,
|
||||||
|
filename="example-docs/fake-email-attachment.msg",
|
||||||
|
):
|
||||||
|
extract_msg_attachment_info(filename=filename, output_dir=tmpdir.dirname)
|
||||||
|
attachment_filename = os.path.join(tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"])
|
||||||
|
attachment_elements = partition_text(
|
||||||
|
filename=attachment_filename,
|
||||||
|
metadata_filename=attachment_filename,
|
||||||
|
)
|
||||||
|
expected_metadata = attachment_elements[0].metadata
|
||||||
|
expected_metadata.file_directory = None
|
||||||
|
expected_metadata.attached_to_filename = filename
|
||||||
|
|
||||||
|
elements = partition_msg(
|
||||||
|
filename=filename,
|
||||||
|
attachment_partitioner=partition_text,
|
||||||
|
process_attachments=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert elements[0].text.startswith("Hello!")
|
||||||
|
|
||||||
|
for element in elements[:-1]:
|
||||||
|
assert element.metadata.filename == "fake-email-attachment.msg"
|
||||||
|
assert element.metadata.subject == "Fake email with attachment"
|
||||||
|
|
||||||
|
assert elements[-1].text == "Hey this is a fake attachment!"
|
||||||
|
assert elements[-1].metadata == expected_metadata
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_msg_raises_with_no_partitioner(
|
||||||
|
filename="example-docs/fake-email-attachment.msg",
|
||||||
|
):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
partition_msg(filename=filename, process_attachments=True)
|
||||||
|
@ -50,6 +50,7 @@ class ElementMetadata:
|
|||||||
file_directory: Optional[str] = None
|
file_directory: Optional[str] = None
|
||||||
date: Optional[str] = None
|
date: Optional[str] = None
|
||||||
filetype: Optional[str] = None
|
filetype: Optional[str] = None
|
||||||
|
attached_to_filename: Optional[str] = None
|
||||||
|
|
||||||
# Page numbers currently supported for PDF, HTML and PPT documents
|
# Page numbers currently supported for PDF, HTML and PPT documents
|
||||||
page_number: Optional[int] = None
|
page_number: Optional[int] = None
|
||||||
|
@ -516,6 +516,9 @@ def add_metadata_with_filetype(filetype: FileType):
|
|||||||
kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
|
kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html")
|
||||||
}
|
}
|
||||||
for element in elements:
|
for element in elements:
|
||||||
|
# NOTE(robinson) - Attached files have already run through this logic
|
||||||
|
# in their own partitioning function
|
||||||
|
if element.metadata.attached_to_filename is None:
|
||||||
_add_element_metadata(
|
_add_element_metadata(
|
||||||
element,
|
element,
|
||||||
filetype=FILETYPE_TO_MIMETYPE[filetype],
|
filetype=FILETYPE_TO_MIMETYPE[filetype],
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import email
|
import email
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from email.message import Message
|
from email.message import Message
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile, TemporaryDirectory
|
||||||
from typing import IO, Dict, List, Optional, Tuple, Union
|
from typing import IO, Callable, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
from unstructured.file_utils.encoding import (
|
from unstructured.file_utils.encoding import (
|
||||||
COMMON_ENCODINGS,
|
COMMON_ENCODINGS,
|
||||||
@ -226,6 +227,9 @@ def partition_email(
|
|||||||
encoding: Optional[str] = None,
|
encoding: Optional[str] = None,
|
||||||
include_headers: bool = False,
|
include_headers: bool = False,
|
||||||
max_partition: Optional[int] = 1500,
|
max_partition: Optional[int] = 1500,
|
||||||
|
metadata_filename: Optional[str] = None,
|
||||||
|
process_attachments: bool = False,
|
||||||
|
attachment_partitioner: Optional[Callable] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions an .eml documents into its constituent elements.
|
"""Partitions an .eml documents into its constituent elements.
|
||||||
@ -245,6 +249,13 @@ def partition_email(
|
|||||||
max_partition
|
max_partition
|
||||||
The maximum number of characters to include in a partition. If None is passed,
|
The maximum number of characters to include in a partition. If None is passed,
|
||||||
no maximum is applied. Only applies if processing the text/plain content.
|
no maximum is applied. Only applies if processing the text/plain content.
|
||||||
|
metadata_filename
|
||||||
|
The filename to use for the metadata.
|
||||||
|
process_attachments
|
||||||
|
If True, partition_email will process email attachments in addition to
|
||||||
|
processing the content of the email itself.
|
||||||
|
attachment_partitioner
|
||||||
|
The partitioning function to use to process attachments.
|
||||||
"""
|
"""
|
||||||
if content_source not in VALID_CONTENT_SOURCES:
|
if content_source not in VALID_CONTENT_SOURCES:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -258,6 +269,8 @@ def partition_email(
|
|||||||
# Verify that only one of the arguments was provided
|
# Verify that only one of the arguments was provided
|
||||||
exactly_one(filename=filename, file=file, text=text)
|
exactly_one(filename=filename, file=file, text=text)
|
||||||
|
|
||||||
|
metadata_filename = metadata_filename or filename
|
||||||
|
|
||||||
detected_encoding = "utf-8"
|
detected_encoding = "utf-8"
|
||||||
if filename is not None:
|
if filename is not None:
|
||||||
extracted_encoding, msg = parse_email(filename=filename)
|
extracted_encoding, msg = parse_email(filename=filename)
|
||||||
@ -341,7 +354,25 @@ def partition_email(
|
|||||||
header = partition_email_header(msg)
|
header = partition_email_header(msg)
|
||||||
all_elements = header + elements
|
all_elements = header + elements
|
||||||
|
|
||||||
metadata = build_email_metadata(msg, filename=filename)
|
metadata = build_email_metadata(msg, filename=metadata_filename)
|
||||||
for element in all_elements:
|
for element in all_elements:
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
|
|
||||||
|
if process_attachments:
|
||||||
|
with TemporaryDirectory() as tmpdir:
|
||||||
|
extract_attachment_info(msg, tmpdir)
|
||||||
|
attached_files = os.listdir(tmpdir)
|
||||||
|
for attached_file in attached_files:
|
||||||
|
attached_filename = os.path.join(tmpdir, attached_file)
|
||||||
|
if attachment_partitioner is None:
|
||||||
|
raise ValueError(
|
||||||
|
"Specify the attachment_partitioner kwarg to process attachments.",
|
||||||
|
)
|
||||||
|
attached_elements = attachment_partitioner(filename=attached_filename)
|
||||||
|
for element in attached_elements:
|
||||||
|
element.metadata.filename = attached_file
|
||||||
|
element.metadata.file_directory = None
|
||||||
|
element.metadata.attached_to_filename = metadata_filename
|
||||||
|
all_elements.append(element)
|
||||||
|
|
||||||
return all_elements
|
return all_elements
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import IO, Dict, List, Optional
|
from typing import IO, Callable, Dict, List, Optional
|
||||||
|
|
||||||
import msg_parser
|
import msg_parser
|
||||||
|
|
||||||
@ -17,6 +18,9 @@ def partition_msg(
|
|||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
file: Optional[IO] = None,
|
file: Optional[IO] = None,
|
||||||
max_partition: Optional[int] = 1500,
|
max_partition: Optional[int] = 1500,
|
||||||
|
metadata_filename: Optional[str] = None,
|
||||||
|
process_attachments: bool = False,
|
||||||
|
attachment_partitioner: Optional[Callable] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions a MSFT Outlook .msg file
|
"""Partitions a MSFT Outlook .msg file
|
||||||
@ -30,6 +34,13 @@ def partition_msg(
|
|||||||
max_partition
|
max_partition
|
||||||
The maximum number of characters to include in a partition. If None is passed,
|
The maximum number of characters to include in a partition. If None is passed,
|
||||||
no maximum is applied. Only applies if processing text/plain content.
|
no maximum is applied. Only applies if processing text/plain content.
|
||||||
|
metadata_filename
|
||||||
|
The filename to use for the metadata.
|
||||||
|
process_attachments
|
||||||
|
If True, partition_msg will process email attachments in addition to
|
||||||
|
processing the content of the email itself.
|
||||||
|
attachment_partitioner
|
||||||
|
The partitioning function to use to process attachments.
|
||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
|
|
||||||
@ -41,16 +52,35 @@ def partition_msg(
|
|||||||
tmp.close()
|
tmp.close()
|
||||||
msg_obj = msg_parser.MsOxMessage(tmp.name)
|
msg_obj = msg_parser.MsOxMessage(tmp.name)
|
||||||
|
|
||||||
|
metadata_filename = metadata_filename or filename
|
||||||
|
|
||||||
text = msg_obj.body
|
text = msg_obj.body
|
||||||
if "<html>" in text or "</div>" in text:
|
if "<html>" in text or "</div>" in text:
|
||||||
elements = partition_html(text=text)
|
elements = partition_html(text=text)
|
||||||
else:
|
else:
|
||||||
elements = partition_text(text=text, max_partition=max_partition)
|
elements = partition_text(text=text, max_partition=max_partition)
|
||||||
|
|
||||||
metadata = build_msg_metadata(msg_obj, filename)
|
metadata = build_msg_metadata(msg_obj, metadata_filename)
|
||||||
for element in elements:
|
for element in elements:
|
||||||
element.metadata = metadata
|
element.metadata = metadata
|
||||||
|
|
||||||
|
if process_attachments:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
extract_msg_attachment_info(msg_obj=msg_obj, output_dir=tmpdir)
|
||||||
|
attached_files = os.listdir(tmpdir)
|
||||||
|
for attached_file in attached_files:
|
||||||
|
attached_filename = os.path.join(tmpdir, attached_file)
|
||||||
|
if attachment_partitioner is None:
|
||||||
|
raise ValueError(
|
||||||
|
"Specify the attachment_partitioner kwarg to process attachments.",
|
||||||
|
)
|
||||||
|
attached_elements = attachment_partitioner(filename=attached_filename)
|
||||||
|
for element in attached_elements:
|
||||||
|
element.metadata.filename = attached_file
|
||||||
|
element.metadata.file_directory = None
|
||||||
|
element.metadata.attached_to_filename = metadata_filename
|
||||||
|
elements.append(element)
|
||||||
|
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
@ -78,11 +108,12 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str])
|
|||||||
|
|
||||||
|
|
||||||
def extract_msg_attachment_info(
|
def extract_msg_attachment_info(
|
||||||
filename: str,
|
filename: Optional[str] = None,
|
||||||
file: Optional[IO] = None,
|
file: Optional[IO] = None,
|
||||||
output_dir: Optional[str] = None,
|
output_dir: Optional[str] = None,
|
||||||
|
msg_obj: Optional[msg_parser.MsOxMessage] = None,
|
||||||
) -> List[Dict[str, str]]:
|
) -> List[Dict[str, str]]:
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file, msg_obj=msg_obj)
|
||||||
|
|
||||||
if filename is not None:
|
if filename is not None:
|
||||||
msg_obj = msg_parser.MsOxMessage(filename)
|
msg_obj = msg_parser.MsOxMessage(filename)
|
||||||
@ -91,6 +122,8 @@ def extract_msg_attachment_info(
|
|||||||
tmp.write(file.read())
|
tmp.write(file.read())
|
||||||
tmp.close()
|
tmp.close()
|
||||||
msg_obj = msg_parser.MsOxMessage(tmp.name)
|
msg_obj = msg_parser.MsOxMessage(tmp.name)
|
||||||
|
elif msg_obj is not None:
|
||||||
|
msg_obj = msg_obj
|
||||||
|
|
||||||
list_attachments = []
|
list_attachments = []
|
||||||
|
|
||||||
@ -105,8 +138,8 @@ def extract_msg_attachment_info(
|
|||||||
list_attachments.append(attachment_info)
|
list_attachments.append(attachment_info)
|
||||||
|
|
||||||
if output_dir is not None:
|
if output_dir is not None:
|
||||||
filename = output_dir + "/" + attachment_info["filename"]
|
output_filename = output_dir + "/" + attachment_info["filename"]
|
||||||
with open(filename, "wb") as f:
|
with open(output_filename, "wb") as f:
|
||||||
f.write(attachment.data)
|
f.write(attachment.data)
|
||||||
|
|
||||||
return list_attachments
|
return list_attachments
|
||||||
|
Loading…
x
Reference in New Issue
Block a user