diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ef29e898..88546bd41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ -## 0.4.2-dev0 +## 0.4.2 + * Added `partition_image` to process documents in an image format. +* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html` ## 0.4.1 diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 3bbf28c8d..282e58995 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -122,6 +122,18 @@ def test_partition_email_header(): assert elements == HEADER_EXPECTED_OUTPUT +def test_extract_email_text_matches_html(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml") + elements_from_text = partition_email(filename=filename, content_source="text/plain") + elements_from_html = partition_email(filename=filename, content_source="text/html") + + assert len(elements_from_text) == len(elements_from_html) + # NOTE(robinson) - checking each individually is necessary because the text/html returns + # HTMLTitle, HTMLNarrativeText, etc + for i, element in enumerate(elements_from_text): + assert element == elements_from_html[i] + + def test_extract_attachment_info(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml") with open(filename, "r") as f: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 299e3314f..35536e084 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.4.2-dev0" # pragma: no cover +__version__ = "0.4.2" # pragma: no cover diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 77bbd505b..362793bc4 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -181,30 +181,35 @@ def partition_email( else: raise ValueError("Only one of filename, file, or text can be specified.") - 
content_map: Dict[str, str] = { - part.get_content_type(): part.get_payload() for part in msg.walk() - } + content_map: Dict[str, str] = {} + for part in msg.walk(): + # NOTE(robinson) - content disposition is None for the content of the email itself. + # Other dispositions include "attachment" for attachments + if part.get_content_disposition() is not None: + continue + content_type = part.get_content_type() + content_map[content_type] = part.get_payload() content = content_map.get(content_source, "") if not content: raise ValueError(f"{content_source} content not found in email") - # NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that - # looks like the following, resulting in extraneous "=" characters in the output if - # you don't clean it up - #