mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 07:05:20 +00:00

Adds filetype to metadata. I've created a decorator that adds metadata to a list of elements. This replaces some existing boilerplate, but also adds a nice layered approach to determining the filetype. Since in some cases several partition_ functions handle a file in various formats, the partition function that first touches a file will be the last one to alter its metadata, resulting in the correct filetype metadata. Tests are added to make sure that: (1) when partition is used, any content type or auto file type detection will override file-specific partition function metadata; and (2) both auto and file-specific partitioning give the desired filetype metadata. This won't work with image files currently: the plumbing is there to use the image format inferred by PIL, but we need to pull in the fix from this PR to unstructured-inference.
92 lines
2.8 KiB
Python
92 lines
2.8 KiB
Python
import os
|
|
import pathlib
|
|
|
|
import msg_parser
|
|
import pytest
|
|
|
|
from unstructured.documents.elements import (
|
|
ElementMetadata,
|
|
ListItem,
|
|
NarrativeText,
|
|
Title,
|
|
)
|
|
from unstructured.partition.msg import extract_msg_attachment_info, partition_msg
|
|
|
|
# Absolute path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()

# Location of the sample documents: the "example-docs" folder two levels
# above this module. The ".." segments are left unnormalized on purpose;
# the value is a plain string produced by os.path.join.
EXAMPLE_DOCS_DIRECTORY = os.path.join(
    DIRECTORY,
    "..",
    "..",
    "example-docs",
)
|
|
|
|
# Elements the tests expect partition_msg to produce for "fake-email.msg",
# in document order.
EXPECTED_MSG_OUTPUT = [
    NarrativeText(text="This is a test email to use for unit tests."),
    Title(text="Important points:"),
    ListItem(text="Roses are red"),
    ListItem(text="Violets are blue"),
]
|
|
|
|
# Attachment records the tests expect extract_msg_attachment_info to return
# for "fake-email-attachment.msg": a single text attachment.
ATTACH_EXPECTED_OUTPUT = [
    {
        "filename": "fake-attachment.txt",
        "extension": ".txt",
        "file_size": "unknown",
        "payload": b"Hey this is a fake attachment!",
    },
]
|
|
|
|
|
|
def test_partition_msg_from_filename():
    """Partition the sample .msg by path and check elements and metadata."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")

    elements = partition_msg(filename=msg_path)

    assert elements == EXPECTED_MSG_OUTPUT

    # Only the first element's metadata is inspected; it must carry the
    # email headers plus the Outlook MIME type as the filetype.
    expected_metadata = ElementMetadata(
        filename="fake-email.msg",
        date="2022-12-16T17:04:16-05:00",
        page_number=1,
        url=None,
        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
        sent_to=["Matthew Robinson (None)"],
        subject="Test Email",
        filetype="application/vnd.ms-outlook",
    )
    assert elements[0].metadata == expected_metadata
|
|
|
|
|
|
class MockMsOxMessage:
    """Test double patched in for ``msg_parser.MsOxMessage``.

    It exposes only a fixed plain-text ``body`` attribute.
    """

    def __init__(self, filename):
        """Accept ``filename`` like the real constructor call, but ignore it."""
        plain_text_body = "Here is an email with plain text."
        self.body = plain_text_body
|
|
|
|
|
|
def test_partition_msg_from_filename_with_text_content(monkeypatch):
    """Check that a message with a plain-text body yields that text first."""
    # Swap the real parser class for the stub so the body is fixed plain text.
    monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage)

    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
    elements = partition_msg(filename=msg_path)

    first_element_text = str(elements[0])
    assert first_element_text == "Here is an email with plain text."
|
|
|
|
|
|
def test_partition_msg_raises_with_missing_file():
    """Expect FileNotFoundError when the given .msg path does not exist."""
    missing_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg")

    with pytest.raises(FileNotFoundError):
        partition_msg(filename=missing_path)
|
|
|
|
|
|
def test_partition_msg_from_file():
    """Partition the sample .msg from an open binary file object."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")

    with open(msg_path, "rb") as msg_file:
        elements = partition_msg(file=msg_file)

    assert elements == EXPECTED_MSG_OUTPUT
|
|
|
|
|
|
def test_extract_attachment_info():
    """Check the attachment records extracted from the sample .msg file.

    The path is built from the shared EXAMPLE_DOCS_DIRECTORY constant, like
    the other tests in this module, instead of re-deriving the example-docs
    location by hand. The resulting path is the same.
    """
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.msg")

    attachment_info = extract_msg_attachment_info(filename)

    # Checked separately so an empty result fails with a clearer message
    # than the full equality comparison below.
    assert len(attachment_info) > 0
    assert attachment_info == ATTACH_EXPECTED_OUTPUT
|
|
|
|
|
|
def test_partition_msg_raises_with_both_specified():
    """Expect ValueError when both ``filename`` and ``file`` are supplied."""
    msg_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")

    with open(msg_path, "rb") as msg_file, pytest.raises(ValueError):
        partition_msg(filename=msg_path, file=msg_file)
|
|
|
|
|
|
def test_partition_msg_raises_with_neither():
    """Expect ValueError when neither ``filename`` nor ``file`` is supplied."""
    with pytest.raises(ValueError):
        partition_msg()
|