fix: allow partition_email to process emails with no content (#797)

* version and changelog

* ingest-test-fixtures-update
This commit is contained in:
Matt Robinson 2023-06-22 12:52:27 -04:00 committed by GitHub
parent 8683e2695c
commit 901ef16835
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 9 additions and 7 deletions

View File

@@ -1,4 +1,4 @@
## 0.7.8-dev0
## 0.7.8-dev1
### Enhancements
@@ -8,6 +8,7 @@
### Fixes
* `partition_email` now works if there is no message content
* Updates the `"fast"` strategy for `partition_pdf` so that it's able to recursively
* Adds recursive functionality to all fsspec connectors
* Adds generic --recursive ingest flag

View File

@@ -322,7 +322,7 @@ def test_convert_to_iso_8601(time, expected):
assert iso_time == expected
def test_partition_email_raises_with_no_html_content():
def test_partition_email_still_works_with_no_content():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
with pytest.raises(ValueError):
partition_email(filename=filename)
elements = partition_email(filename=filename)
assert elements == []

View File

@@ -1 +1 @@
__version__ = "0.7.8-dev0" # pragma: no cover
__version__ = "0.7.8-dev1" # pragma: no cover

View File

@@ -283,9 +283,9 @@ def partition_email(
content = content_map.get(content_source, "")
if not content:
raise ValueError(f"{content_source} content not found in email")
elements = []
if content_source == "text/html":
elif content_source == "text/html":
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
# looks like the following, resulting in extraneous "=" characters in the output if
# you don't clean it up
@@ -316,6 +316,7 @@ def partition_email(
break
except (UnicodeDecodeError, UnicodeError):
continue
elif content_source == "text/plain":
list_content = split_by_paragraph(content)
elements = partition_text(text=content, encoding=encoding)