mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: Add extract_attachment_info
(#112)
* Adds function to extract attachments and their metadata from eml files
This commit is contained in:
parent
456735735c
commit
509ad4951c
@ -1,10 +1,12 @@
|
||||
## 0.3.5-dev3
|
||||
## 0.3.5-dev4
|
||||
|
||||
* Add new pattern to recognize plain text dash bullets
|
||||
* Add test for bullet patterns
|
||||
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
|
||||
elements
|
||||
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
|
||||
* Add new function `extract_attachment_info` that extracts and decodes the attachments
|
||||
of an email.
|
||||
|
||||
## 0.3.4
|
||||
|
||||
|
@ -77,6 +77,23 @@ Examples:
|
||||
text = f.read()
|
||||
elements = partition_email(text=text)
|
||||
|
||||
``extract_attachment_info``
|
||||
---------------------------
|
||||
|
||||
The ``extract_attachment_info`` function takes an ``email.message.Message`` object
|
||||
as input and returns a list of dictionaries containing the attachment information,
|
||||
such as ``filename``, ``size``, ``payload``, etc. The attachment is saved to the ``output_dir``
|
||||
if specified.
|
||||
|
||||
.. code:: python
|
||||
|
||||
import email
|
||||
from unstructured.partition.email import extract_attachment_info
|
||||
|
||||
with open("example-docs/fake-email-attachment.eml", "r") as f:
|
||||
msg = email.message_from_file(f)
|
||||
attachment_info = extract_attachment_info(msg, output_dir="example-docs")
|
||||
|
||||
|
||||
``is_bulleted_text``
|
||||
----------------------
|
||||
|
50
example-docs/fake-email-attachment.eml
Normal file
50
example-docs/fake-email-attachment.eml
Normal file
@ -0,0 +1,50 @@
|
||||
MIME-Version: 1.0
|
||||
Date: Fri, 23 Dec 2022 12:08:48 -0600
|
||||
Message-ID: <CAPgNNXSzLVJ-d1OCX_TjFgJU7ugtQrjFybPtAMmmYZzphxNFYg@mail.gmail.com>
|
||||
Subject: Fake email with attachment
|
||||
From: Mallori Harrell <mallori@unstructured.io>
|
||||
To: Mallori Harrell <mallori@unstructured.io>
|
||||
Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7"
|
||||
|
||||
--0000000000005d654405f082adb7
|
||||
Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5"
|
||||
|
||||
--0000000000005d654205f082adb5
|
||||
Content-Type: text/plain; charset="UTF-8"
|
||||
|
||||
Hello!
|
||||
|
||||
Here's the attachments!
|
||||
|
||||
It includes:
|
||||
|
||||
- Lots of whitespace
|
||||
- Little to no content
|
||||
- and is a quick read
|
||||
|
||||
Best,
|
||||
|
||||
Mallori
|
||||
|
||||
--0000000000005d654205f082adb5
|
||||
Content-Type: text/html; charset="UTF-8"
|
||||
Content-Transfer-Encoding: quoted-printable
|
||||
|
||||
<div dir=3D"ltr">Hello!=C2=A0<div><br></div><div>Here's the attachments=
|
||||
!</div><div><br></div><div>It includes:</div><div><ul><li style=3D"margin-l=
|
||||
eft:15px">Lots of whitespace</li><li style=3D"margin-left:15px">Little=C2=
|
||||
=A0to no content</li><li style=3D"margin-left:15px">and is a quick read</li=
|
||||
></ul><div>Best,</div></div><div><br></div><div>Mallori</div><div dir=3D"lt=
|
||||
r" class=3D"gmail_signature" data-smartmail=3D"gmail_signature"><div dir=3D=
|
||||
"ltr"><div><div><br></div></div></div></div></div>
|
||||
|
||||
--0000000000005d654205f082adb5--
|
||||
--0000000000005d654405f082adb7
|
||||
Content-Type: text/plain; charset="US-ASCII"; name="fake-attachment.txt"
|
||||
Content-Disposition: attachment; filename="fake-attachment.txt"
|
||||
Content-Transfer-Encoding: base64
|
||||
X-Attachment-Id: f_lc0tto5j0
|
||||
Content-ID: <f_lc0tto5j0>
|
||||
|
||||
SGV5IHRoaXMgaXMgYSBmYWtlIGF0dGFjaG1lbnQh
|
||||
--0000000000005d654405f082adb7--
|
@ -1,9 +1,10 @@
|
||||
import email
|
||||
import os
|
||||
import pathlib
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import NarrativeText, Title, ListItem
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.email import partition_email, extract_attachment_info
|
||||
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
@ -16,6 +17,10 @@ EXPECTED_OUTPUT = [
|
||||
ListItem(text="Violets are blue"),
|
||||
]
|
||||
|
||||
ATTACH_EXPECTED_OUTPUT = [
|
||||
{"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"}
|
||||
]
|
||||
|
||||
|
||||
def test_partition_email_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
@ -41,6 +46,15 @@ def test_partition_email_from_text():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_extract_attachment_info():
    """The sample email with one attachment yields the expected attachment info."""
    eml_path = DIRECTORY.joinpath("..", "..", "example-docs", "fake-email-attachment.eml")
    with eml_path.open("r") as eml_file:
        parsed_message = email.message_from_file(eml_file)

    extracted = extract_attachment_info(parsed_message)

    # Guard against a vacuous comparison before checking the exact contents.
    assert len(extracted) > 0
    assert extracted == ATTACH_EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_raises_with_none_specified():
    """Calling partition_email with no arguments must raise a ValueError."""
    expects_value_error = pytest.raises(ValueError)
    with expects_value_error:
        partition_email()
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.3.5-dev3" # pragma: no cover
|
||||
__version__ = "0.3.5-dev4" # pragma: no cover
|
||||
|
@ -1,5 +1,6 @@
|
||||
import email
|
||||
import sys
|
||||
from email.message import Message
|
||||
from typing import Dict, IO, List, Optional
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
@ -7,7 +8,7 @@ if sys.version_info < (3, 8):
|
||||
else:
|
||||
from typing import Final
|
||||
|
||||
from unstructured.cleaners.core import replace_mime_encodings
|
||||
from unstructured.cleaners.core import replace_mime_encodings, clean_extra_whitespace
|
||||
from unstructured.documents.elements import Element, Text
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
@ -15,6 +16,36 @@ from unstructured.partition.html import partition_html
|
||||
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html"]
|
||||
|
||||
|
||||
def extract_attachment_info(
    message: Message, output_dir: Optional[str] = None
) -> List[Dict[str, str]]:
    """Extract the attachments of an email message along with their metadata.

    Every MIME part of ``message`` that carries a ``Content-Disposition`` header
    produces one dictionary holding that header's parameters (for example
    ``filename``) plus a ``payload`` key with the transfer-decoded content.

    Parameters
    ----------
    message
        The parsed email to inspect; all of its parts are walked.
    output_dir
        Optional directory. When given, every attachment that has both a
        ``filename`` and a payload is also written into this directory.

    Returns
    -------
    One dictionary per part with a ``Content-Disposition`` header, in walk order.
    Parameter values are ``str``; ``payload`` is ``bytes``, or ``None`` when the
    part has no decodable body (e.g. a multipart container).
    """
    # Imported locally so the function is self-contained.
    import os
    from email.utils import collapse_rfc2231_value

    list_attachments = []
    for part in message.walk():
        if "content-disposition" not in part:
            continue

        # A fresh dict per part: sharing one dict across iterations would make
        # every entry of the returned list alias the last attachment.
        attachment_info = {}

        # Let the stdlib parse the header: it handles quoting, bare tokens and
        # "=" inside values. The first entry is the disposition type itself
        # ("attachment", "inline", ...), which is not a key/value parameter.
        params = part.get_params(failobj=[], header="content-disposition")
        for key, value in params[1:]:
            if isinstance(value, tuple):
                # RFC 2231 encoded parameter: (charset, language, value).
                value = collapse_rfc2231_value(value)
            attachment_info[key] = value

        attachment_info["payload"] = part.get_payload(decode=True)
        list_attachments.append(attachment_info)

    if output_dir:
        for attachment in list_attachments:
            filename = attachment.get("filename")
            payload = attachment["payload"]
            if not filename or payload is None:
                # Nothing to name the file after, or nothing to write.
                continue

            # The filename comes from the email itself and is untrusted: keep
            # only its final component so it cannot escape ``output_dir``.
            safe_name = os.path.basename(filename.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue

            with open(os.path.join(output_dir, safe_name), "wb") as f:
                # mypy wants to just use `w` when opening the file but this
                # causes an error since the payloads are bytes not str
                f.write(payload)  # type: ignore
    return list_attachments
|
||||
|
||||
|
||||
def partition_email(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
|
Loading…
x
Reference in New Issue
Block a user