qued 55272eeceb
enhancement: filetype in metadata (#583)
Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata.

Tests are added to make sure:

* When partition is used, any content type or auto file type detection will override file-specific partition function metadata
* Both auto and file-specific partitioning gives the desired filetype metadata

Won't work with image files currently... the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference .
2023-05-15 13:23:19 -05:00

248 lines
8.2 KiB
Python

import datetime
import email
import os
import pathlib
import pytest
from unstructured.documents.elements import (
ElementMetadata,
Image,
ListItem,
NarrativeText,
Title,
)
from unstructured.documents.email_elements import (
MetaData,
ReceivedInfo,
Recipient,
Sender,
Subject,
)
from unstructured.partition.email import (
convert_to_iso_8601,
extract_attachment_info,
partition_email,
partition_email_header,
)
DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXPECTED_OUTPUT = [
NarrativeText(text="This is a test email to use for unit tests."),
Title(text="Important points:"),
ListItem(text="Roses are red"),
ListItem(text="Violets are blue"),
]
IMAGE_EXPECTED_OUTPUT = [
NarrativeText(text="This is a test email to use for unit tests."),
Title(text="Important points:"),
NarrativeText(text="hello this is our logo."),
Image(text="unstructured_logo.png"),
ListItem(text="Roses are red"),
ListItem(text="Violets are blue"),
]
RECEIVED_HEADER_OUTPUT = [
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
ReceivedInfo(
name="received_datetimetz",
text="2023-02-20 10:03:18+12:00",
datestamp=datetime.datetime(
2023,
2,
20,
10,
3,
18,
tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)),
),
),
MetaData(name="MIME-Version", text="1.0"),
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
MetaData(
name="Message-ID",
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
),
Subject(text="Test Email"),
Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
MetaData(
name="Content-Type",
text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
),
]
HEADER_EXPECTED_OUTPUT = [
MetaData(name="MIME-Version", text="1.0"),
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
MetaData(
name="Message-ID",
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
),
Subject(text="Test Email"),
Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
MetaData(
name="Content-Type",
text='multipart/alternative; boundary="00000000000095c9b205eff92630"',
),
]
ALL_EXPECTED_OUTPUT = HEADER_EXPECTED_OUTPUT + EXPECTED_OUTPUT
ATTACH_EXPECTED_OUTPUT = [
{"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"},
]
def test_partition_email_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
elements = partition_email(filename=filename)
assert len(elements) > 0
assert elements == EXPECTED_OUTPUT
def test_partition_email_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
with open(filename) as f:
elements = partition_email(file=f)
assert len(elements) > 0
assert elements == EXPECTED_OUTPUT
def test_partition_email_from_file_rb():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
with open(filename, "rb") as f:
elements = partition_email(file=f)
assert len(elements) > 0
assert elements == EXPECTED_OUTPUT
def test_partition_email_from_text_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.txt")
with open(filename) as f:
elements = partition_email(file=f, content_source="text/plain")
assert len(elements) > 0
assert elements == EXPECTED_OUTPUT
def test_partition_email_from_text_file_with_headers():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.txt")
with open(filename) as f:
elements = partition_email(file=f, content_source="text/plain", include_headers=True)
assert len(elements) > 0
assert elements == ALL_EXPECTED_OUTPUT
def test_partition_email_from_text():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
with open(filename) as f:
text = f.read()
elements = partition_email(text=text)
assert len(elements) > 0
assert elements == EXPECTED_OUTPUT
def test_partition_email_from_text_work_with_empty_string():
assert partition_email(text="") == []
def test_partition_email_from_filename_with_embedded_image():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-image-embedded.eml")
elements = partition_email(filename=filename, content_source="text/plain")
assert len(elements) > 0
assert elements == IMAGE_EXPECTED_OUTPUT
def test_partition_email_header():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
with open(filename) as f:
msg = email.message_from_file(f)
elements = partition_email_header(msg)
assert len(elements) > 0
assert elements == RECEIVED_HEADER_OUTPUT
def test_partition_email_has_metadata():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
elements = partition_email(filename=filename)
assert len(elements) > 0
assert elements[0].metadata == ElementMetadata(
filename=filename,
date="2022-12-16T17:04:16-05:00",
page_number=1,
url=None,
sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
subject="Test Email",
filetype="message/rfc822",
)
expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
assert elements[0].metadata.get_date() == expected_dt
def test_extract_email_text_matches_html():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
elements_from_text = partition_email(filename=filename, content_source="text/plain")
elements_from_html = partition_email(filename=filename, content_source="text/html")
assert len(elements_from_text) == len(elements_from_html)
# NOTE(robinson) - checking each individually is necessary because the text/html returns
# HTMLTitle, HTMLNarrativeText, etc
for i, element in enumerate(elements_from_text):
assert element == elements_from_text[i]
def test_extract_attachment_info():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
with open(filename) as f:
msg = email.message_from_file(f)
attachment_info = extract_attachment_info(msg)
assert len(attachment_info) > 0
assert attachment_info == ATTACH_EXPECTED_OUTPUT
def test_partition_email_raises_with_none_specified():
with pytest.raises(ValueError):
partition_email()
def test_partition_email_raises_with_too_many_specified():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
with open(filename) as f:
text = f.read()
with pytest.raises(ValueError):
partition_email(filename=filename, text=text)
def test_partition_email_raises_with_invalid_content_type():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
with pytest.raises(ValueError):
partition_email(filename=filename, content_source="application/json")
def test_partition_email_processes_fake_email_with_header():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
elements = partition_email(filename=filename)
assert len(elements) > 0
@pytest.mark.parametrize(
(("time", "expected")),
[
("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
("Thursday 5/3/2023 02:32:49", None),
],
)
def test_convert_to_iso_8601(time, expected):
iso_time = convert_to_iso_8601(time)
assert iso_time == expected