mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-15 17:23:54 +00:00
fix: resolves UnicodeDecodeError in partition_email for emails with attachments (#158)
* split emails by \n= * added test for equivalence between html and plain text * changelog and bump version * add check for content disposition
This commit is contained in:
parent
7ed5f71e30
commit
9c3c14e94d
@ -1,5 +1,7 @@
|
||||
## 0.4.2-dev0
|
||||
## 0.4.2
|
||||
|
||||
* Added `partition_image` to process documents in an image format.
|
||||
* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`
|
||||
|
||||
|
||||
## 0.4.1
|
||||
|
||||
@ -122,6 +122,18 @@ def test_partition_email_header():
|
||||
assert elements == HEADER_EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_extract_email_text_matches_html():
    """Check that the text/plain and text/html parts of an email partition alike.

    The same ``fake-email-attachment.eml`` fixture is partitioned once per
    content source; the two results must have the same length and compare
    equal element by element.
    """
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")

    assert len(elements_from_text) == len(elements_from_html)
    # NOTE(robinson) - checking each individually is necessary because the text/html returns
    # HTMLTitle, HTMLNarrativeText, etc
    # Pair each plain-text element with its HTML counterpart; comparing the
    # plain-text list against itself would pass no matter what the HTML path returns.
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element
|
||||
|
||||
|
||||
def test_extract_attachment_info():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
|
||||
with open(filename, "r") as f:
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.2-dev0" # pragma: no cover
|
||||
__version__ = "0.4.2" # pragma: no cover
|
||||
|
||||
@ -181,30 +181,35 @@ def partition_email(
|
||||
else:
|
||||
raise ValueError("Only one of filename, file, or text can be specified.")
|
||||
|
||||
content_map: Dict[str, str] = {
|
||||
part.get_content_type(): part.get_payload() for part in msg.walk()
|
||||
}
|
||||
content_map: Dict[str, str] = {}
|
||||
for part in msg.walk():
|
||||
# NOTE(robinson) - content disposition is None for the content of the email itself.
|
||||
# Other dispositions include "attachment" for attachments
|
||||
if part.get_content_disposition() is not None:
|
||||
continue
|
||||
content_type = part.get_content_type()
|
||||
content_map[content_type] = part.get_payload()
|
||||
|
||||
content = content_map.get(content_source, "")
|
||||
if not content:
|
||||
raise ValueError(f"{content_source} content not found in email")
|
||||
|
||||
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
|
||||
# looks like the following, resulting in extraneous "=" characters in the output if
|
||||
# you don't clean it up
|
||||
# <ul> =
|
||||
# <li>Item 1</li>=
|
||||
# <li>Item 2<li>=
|
||||
# </ul>
|
||||
list_content = split_by_paragraph(content)
|
||||
|
||||
if content_source == "text/html":
|
||||
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
|
||||
# looks like the following, resulting in extraneous "=" characters in the output if
|
||||
# you don't clean it up
|
||||
# <ul> =
|
||||
# <li>Item 1</li>=
|
||||
# <li>Item 2<li>=
|
||||
# </ul>
|
||||
list_content = content.split("=\n")
|
||||
content = "".join(list_content)
|
||||
elements = partition_html(text=content)
|
||||
for element in elements:
|
||||
if isinstance(element, Text):
|
||||
element.apply(replace_mime_encodings)
|
||||
elif content_source == "text/plain":
|
||||
list_content = split_by_paragraph(content)
|
||||
elements = partition_text(text=content)
|
||||
|
||||
for idx, element in enumerate(elements):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user