mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 15:11:30 +00:00
enhancement: improve text clearing process in email
partitioning (#3422)
### Summary

Currently, the email partitioner removes only `=\n` characters during the clearing process. However, email content sometimes contains `=\r\n` characters, especially when read from file-like objects such as `SpooledTemporaryFile` (the file type used in our API). This PR updates the email partitioner to remove both `=\n` and `=\r\n` characters during the clearing process.

### Testing

```
filename = "example-docs/eml/family-day.eml"
elements = partition_email(
    filename=filename,
)
print(f"From filename: {elements[3].text}")

with open(filename, "rb") as test_file:
    spooled_temp_file = tempfile.SpooledTemporaryFile()
    spooled_temp_file.write(test_file.read())
    spooled_temp_file.seek(0)
    elements = partition_email(file=spooled_temp_file)
    print(f"From spooled_temp_file: {elements[3].text}")
```

**Results:**

- on `main`

```
From filename: Make sure to RSVP!
From spooled_temp_file: Make sure to = RSVP!
```

- on `PR`

```
From filename: Make sure to RSVP!
From spooled_temp_file: Make sure to RSVP!
```
This commit is contained in:
parent
1df7908f03
commit
ec59abfabc
@ -1,7 +1,8 @@
|
|||||||
## 0.15.0-dev16
|
## 0.15.0
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* **Improve text clearing process in email partitioning.** Updated the email partitioner to remove both `=\n` and `=\r\n` characters during the clearing process. Previously, only `=\n` characters were removed.
|
||||||
* **Bump unstructured.paddleocr to 2.8.0.1.**
|
* **Bump unstructured.paddleocr to 2.8.0.1.**
|
||||||
* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
|
* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
|
||||||
* **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner.
|
* **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner.
|
||||||
|
39
example-docs/eml/family-day.eml
Normal file
39
example-docs/eml/family-day.eml
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
MIME-Version: 1.0
|
||||||
|
Date: Wed, 21 Dec 2022 10:28:53 -0600
|
||||||
|
Message-ID: <CAPgNNXQKR=o6AsOTr74VMrsDNhUJW0Keou9n3vLa2UO_Nv+tZw@mail.gmail.com>
|
||||||
|
Subject: Family Day
|
||||||
|
From: Mallori Harrell <mallori@unstructured.io>
|
||||||
|
To: Mallori Harrell <mallori@unstructured.io>
|
||||||
|
Content-Type: multipart/alternative; boundary="0000000000005c115405f0590ce4"
|
||||||
|
|
||||||
|
--0000000000005c115405f0590ce4
|
||||||
|
Content-Type: text/plain; charset="UTF-8"
|
||||||
|
|
||||||
|
Hi All,
|
||||||
|
|
||||||
|
Get excited for our first annual family day!
|
||||||
|
|
||||||
|
There will be face painting, a petting zoo, funnel cake and more.
|
||||||
|
|
||||||
|
Make sure to RSVP!
|
||||||
|
|
||||||
|
Best.
|
||||||
|
|
||||||
|
--
|
||||||
|
Mallori Harrell
|
||||||
|
Unstructured Technologies
|
||||||
|
Data Scientist
|
||||||
|
|
||||||
|
--0000000000005c115405f0590ce4
|
||||||
|
Content-Type: text/html; charset="UTF-8"
|
||||||
|
Content-Transfer-Encoding: quoted-printable
|
||||||
|
|
||||||
|
<div dir=3D"ltr">Hi All,<div><br></div><div>Get excited for our first annua=
|
||||||
|
l family day!=C2=A0</div><div><br></div><div>There will be face painting, =
|
||||||
|
a petting zoo, funnel cake and more.</div><div><br></div><div>Make sure to =
|
||||||
|
RSVP!</div><div><br></div><div>Best.<br clear=3D"all"><div><br></div>-- <br=
|
||||||
|
><div dir=3D"ltr" class=3D"gmail_signature" data-smartmail=3D"gmail_signatu=
|
||||||
|
re"><div dir=3D"ltr">Mallori Harrell<div>Unstructured Technologies<br><div>=
|
||||||
|
Data Scientist</div><div><br></div></div></div></div></div></div>
|
||||||
|
|
||||||
|
--0000000000005c115405f0590ce4--
|
@ -2,6 +2,7 @@ import datetime
|
|||||||
import email
|
import email
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -230,6 +231,17 @@ def test_partition_email_from_file_rb_default_encoding(filename, expected_output
|
|||||||
assert element.metadata.filename is None
|
assert element.metadata.filename is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_from_spooled_temp_file():
    """Partition an email supplied as a ``SpooledTemporaryFile``.

    The fixture's HTML part is quoted-printable and wraps lines with a
    trailing ``=`` (soft line break). The expected text below only matches
    when the partitioner removes that soft break, so a leftover ``=`` in
    "Make sure to = RSVP!" would fail the final assertion.
    """
    filename = example_doc_path("eml/family-day.eml")

    # Read the raw bytes first so the source file is closed right away.
    with open(filename, "rb") as test_file:
        email_bytes = test_file.read()

    # Manage the spooled file with a context manager so it is always closed,
    # even when partitioning raises.
    with tempfile.SpooledTemporaryFile() as spooled_temp_file:
        spooled_temp_file.write(email_bytes)
        # Rewind so the partitioner reads from the start of the message.
        spooled_temp_file.seek(0)
        elements = partition_email(file=spooled_temp_file)

    assert len(elements) == 9
    assert elements[3].text == "Make sure to RSVP!"
|
||||||
|
|
||||||
|
|
||||||
def test_partition_email_from_text_file():
|
def test_partition_email_from_text_file():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.15.0-dev16" # pragma: no cover
|
__version__ = "0.15.0" # pragma: no cover
|
||||||
|
@ -416,8 +416,8 @@ def partition_email(
|
|||||||
# <li>Item 1</li>=
|
# <li>Item 1</li>=
|
||||||
# <li>Item 2<li>=
|
# <li>Item 2<li>=
|
||||||
# </ul>
|
# </ul>
|
||||||
list_content = content.split("=\n")
|
|
||||||
content = "".join(list_content)
|
content = content.replace("=\n", "").replace("=\r\n", "")
|
||||||
elements = partition_html(
|
elements = partition_html(
|
||||||
text=content,
|
text=content,
|
||||||
include_metadata=False,
|
include_metadata=False,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user