feat: Add extract_attachment_info (#112)

* Adds function to extract attachments and their metadata from eml files
This commit is contained in:
Mallori Harrell 2023-01-03 11:41:54 -06:00 committed by GitHub
parent 456735735c
commit 509ad4951c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 118 additions and 4 deletions

View File

@ -1,10 +1,12 @@
## 0.3.5-dev3
## 0.3.5-dev4
* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
elements
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
* Add new function `extract_attachment_info` that extracts and decodes the attachments
of an email.
## 0.3.4

View File

@ -77,6 +77,23 @@ Examples:
text = f.read()
elements = partition_email(text=text)
``extract_attachment_info``
---------------------------
The ``extract_attachment_info`` function takes an ``email.message.Message`` object
as input and returns a list of dictionaries containing the attachment information,
such as ``filename``, ``size``, ``payload``, etc. Each attachment is saved to the ``output_dir``
if specified.
.. code:: python
import email
from unstructured.partition.email import extract_attachment_info
with open("example-docs/fake-email-attachment.eml", "r") as f:
msg = email.message_from_file(f)
attachment_info = extract_attachment_info(msg, output_dir="example-docs")
``is_bulleted_text``
----------------------

View File

@ -0,0 +1,50 @@
MIME-Version: 1.0
Date: Fri, 23 Dec 2022 12:08:48 -0600
Message-ID: <CAPgNNXSzLVJ-d1OCX_TjFgJU7ugtQrjFybPtAMmmYZzphxNFYg@mail.gmail.com>
Subject: Fake email with attachment
From: Mallori Harrell <mallori@unstructured.io>
To: Mallori Harrell <mallori@unstructured.io>
Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7"
--0000000000005d654405f082adb7
Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5"
--0000000000005d654205f082adb5
Content-Type: text/plain; charset="UTF-8"
Hello!
Here's the attachments!
It includes:
- Lots of whitespace
- Little to no content
- and is a quick read
Best,
Mallori
--0000000000005d654205f082adb5
Content-Type: text/html; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
<div dir=3D"ltr">Hello!=C2=A0<div><br></div><div>Here&#39;s the attachments=
!</div><div><br></div><div>It includes:</div><div><ul><li style=3D"margin-l=
eft:15px">Lots of whitespace</li><li style=3D"margin-left:15px">Little=C2=
=A0to no content</li><li style=3D"margin-left:15px">and is a quick read</li=
></ul><div>Best,</div></div><div><br></div><div>Mallori</div><div dir=3D"lt=
r" class=3D"gmail_signature" data-smartmail=3D"gmail_signature"><div dir=3D=
"ltr"><div><div><br></div></div></div></div></div>
--0000000000005d654205f082adb5--
--0000000000005d654405f082adb7
Content-Type: text/plain; charset="US-ASCII"; name="fake-attachment.txt"
Content-Disposition: attachment; filename="fake-attachment.txt"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_lc0tto5j0
Content-ID: <f_lc0tto5j0>
SGV5IHRoaXMgaXMgYSBmYWtlIGF0dGFjaG1lbnQh
--0000000000005d654405f082adb7--

View File

@ -1,9 +1,10 @@
import email
import os
import pathlib
import pytest
from unstructured.documents.elements import NarrativeText, Title, ListItem
from unstructured.partition.email import partition_email
from unstructured.partition.email import partition_email, extract_attachment_info
DIRECTORY = pathlib.Path(__file__).parent.resolve()
@ -16,6 +17,10 @@ EXPECTED_OUTPUT = [
ListItem(text="Violets are blue"),
]
# Expected result of extract_attachment_info for the attachment fixture email:
# a single attachment with its declared filename and its decoded payload bytes.
ATTACH_EXPECTED_OUTPUT = [
    {
        "filename": "fake-attachment.txt",
        "payload": b"Hey this is a fake attachment!",
    },
]
def test_partition_email_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
@ -41,6 +46,15 @@ def test_partition_email_from_text():
assert elements == EXPECTED_OUTPUT
def test_extract_attachment_info():
    """Check that the attachment of the fixture email is extracted and decoded."""
    eml_path = os.path.join(
        DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml"
    )
    with open(eml_path, "r") as eml_file:
        parsed_message = email.message_from_file(eml_file)

    extracted = extract_attachment_info(parsed_message)

    # At least one attachment must be found, and the result must match exactly.
    assert len(extracted) > 0
    assert extracted == ATTACH_EXPECTED_OUTPUT
def test_partition_email_raises_with_none_specified():
with pytest.raises(ValueError):
partition_email()

View File

@ -1 +1 @@
__version__ = "0.3.5-dev3" # pragma: no cover
__version__ = "0.3.5-dev4" # pragma: no cover

View File

@ -1,5 +1,6 @@
import email
import sys
from email.message import Message
from typing import Dict, IO, List, Optional
if sys.version_info < (3, 8):
@ -7,7 +8,7 @@ if sys.version_info < (3, 8):
else:
from typing import Final
from unstructured.cleaners.core import replace_mime_encodings
from unstructured.cleaners.core import replace_mime_encodings, clean_extra_whitespace
from unstructured.documents.elements import Element, Text
from unstructured.partition.html import partition_html
@ -15,6 +16,36 @@ from unstructured.partition.html import partition_html
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html"]
def extract_attachment_info(
    message: Message, output_dir: Optional[str] = None
) -> List[Dict[str, str]]:
    """Collect the metadata and decoded payload of every attachment in an email.

    Every MIME part of ``message`` that carries a ``Content-Disposition`` header
    yields one dictionary. Its keys are the (lower-cased) parameters of that
    header, e.g. ``filename`` or ``size``, plus ``payload``, which holds the
    transfer-decoded content of the part as ``bytes`` (``None`` for multipart
    containers).

    Parameters
    ----------
    message
        The parsed email to inspect.
    output_dir
        Optional existing directory. When given, every attachment that has both
        a filename and a payload is written there. Only the final path
        component of the declared filename is used, so a filename supplied by
        the email cannot escape ``output_dir``.

    Returns
    -------
    A list with one dictionary per part that has a ``Content-Disposition``
    header, in the order the parts are visited by ``message.walk()``.
    """
    # Function-scope imports keep this block self-contained.
    import os
    from email.utils import collapse_rfc2231_value

    list_attachments = []
    for part in message.walk():
        if "content-disposition" not in part:
            continue

        # A fresh dict per part: reusing a single dict would make every entry
        # of the returned list alias the data of the last attachment.
        attachment_info = {}

        # Let the email package parse the header so that quoted values holding
        # ";" or "=", trailing semicolons and RFC 2231 encoded parameters are
        # handled correctly. The first entry is the disposition type itself
        # ("attachment", "inline", ...), which is not a parameter.
        params = part.get_params(failobj=[], header="content-disposition")
        for key, value in params[1:]:
            if not key:
                # Produced by an empty segment such as a trailing ";".
                continue
            value = collapse_rfc2231_value(value)
            # Collapse whitespace runs (including header folding) to one space.
            attachment_info[key] = " ".join(value.split())

        attachment_info["payload"] = part.get_payload(decode=True)
        list_attachments.append(attachment_info)

    if output_dir:
        for attachment in list_attachments:
            filename = attachment.get("filename")
            payload = attachment.get("payload")
            if not filename or payload is None:
                # Nothing sensible to write without a name or without content.
                continue
            # The filename comes from the email and is untrusted: drop any
            # directory components (POSIX or Windows style) before joining.
            safe_name = os.path.basename(filename.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue
            with open(os.path.join(output_dir, safe_name), "wb") as f:
                # mypy wants to just use `w` when opening the file but this
                # causes an error since the payloads are bytes not str
                f.write(payload)  # type: ignore
    return list_attachments
def partition_email(
filename: Optional[str] = None,
file: Optional[IO] = None,