diff --git a/CHANGELOG.md b/CHANGELOG.md index 92e3b1009..d451a6ba7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.15.0-dev16 +## 0.15.0 ### Enhancements +* **Improve text clearing process in email partitioning.** Updated the email partitioner to remove both `=\n` and `=\r\n` characters during the clearing process. Previously, only `=\n` characters were removed. * **Bump unstructured.paddleocr to 2.8.0.1.** * **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation. * **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner. diff --git a/example-docs/eml/family-day.eml b/example-docs/eml/family-day.eml new file mode 100644 index 000000000..2c0e83111 --- /dev/null +++ b/example-docs/eml/family-day.eml @@ -0,0 +1,39 @@ +MIME-Version: 1.0 +Date: Wed, 21 Dec 2022 10:28:53 -0600 +Message-ID: +Subject: Family Day +From: Mallori Harrell +To: Mallori Harrell +Content-Type: multipart/alternative; boundary="0000000000005c115405f0590ce4" + +--0000000000005c115405f0590ce4 +Content-Type: text/plain; charset="UTF-8" + +Hi All, + +Get excited for our first annual family day! + +There will be face painting, a petting zoo, funnel cake and more. + +Make sure to RSVP! + +Best. + +-- +Mallori Harrell +Unstructured Technologies +Data Scientist + +--0000000000005c115405f0590ce4 +Content-Type: text/html; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +
Hi All,

Get excited for our first annua= +l family day!=C2=A0

There will be face painting, = +a petting zoo, funnel cake and more.

Make sure to = +RSVP!

Best.

--
Mallori Harrell
Unstructured Technologies
= +Data Scientist

+ +--0000000000005c115405f0590ce4-- \ No newline at end of file diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 52b0214ad..1b3f7cad9 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -2,6 +2,7 @@ import datetime import email import os import pathlib +import tempfile import pytest @@ -230,6 +231,17 @@ def test_partition_email_from_file_rb_default_encoding(filename, expected_output assert element.metadata.filename is None +def test_partition_email_from_spooled_temp_file(): + filename = example_doc_path("eml/family-day.eml") + with open(filename, "rb") as test_file: + spooled_temp_file = tempfile.SpooledTemporaryFile() + spooled_temp_file.write(test_file.read()) + spooled_temp_file.seek(0) + elements = partition_email(file=spooled_temp_file) + assert len(elements) == 9 + assert elements[3].text == "Make sure to RSVP!" + + def test_partition_email_from_text_file(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") with open(filename) as f: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 11dfa8975..25f64ffb0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.0-dev16" # pragma: no cover +__version__ = "0.15.0" # pragma: no cover diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index e47bab242..bd53b26e3 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -416,8 +416,8 @@ def partition_email( #
  • Item 1
  • = #
  • Item 2
  • = # - list_content = content.split("=\n") - content = "".join(list_content) + + content = content.replace("=\n", "").replace("=\r\n", "") elements = partition_html( text=content, include_metadata=False,