mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	enhancement: improve text clearing process in email partitioning (#3422)
				
					
				
			### Summary
Currently, the email partitioner removes only `=\n` sequences (quoted-printable
soft line breaks) during the clearing process. However, email content sometimes
contains `=\r\n` sequences instead, especially when read from file-like objects
such as `SpooledTemporaryFile` (the file type used in our API); these were not
removed, leaving a stray `=` in the extracted text. This PR updates the email
partitioner to remove both `=\n` and `=\r\n` sequences during the clearing process.
### Testing
```
filename = "example-docs/eml/family-day.eml"
elements = partition_email(
    filename=filename,
)
print(f"From filename: {elements[3].text}")
with open(filename, "rb") as test_file:
    spooled_temp_file = tempfile.SpooledTemporaryFile()
    spooled_temp_file.write(test_file.read())
    spooled_temp_file.seek(0)
    elements = partition_email(file=spooled_temp_file)
    print(f"From spooled_temp_file: {elements[3].text}")
```
**Results:**
- on `main`
```
From filename: Make sure to RSVP!
From spooled_temp_file: Make sure to = RSVP!
```
- on `PR`
```
From filename: Make sure to RSVP!
From spooled_temp_file: Make sure to RSVP!
```
			
			
This commit is contained in:
		
							parent
							
								
									1df7908f03
								
							
						
					
					
						commit
						ec59abfabc
					
				| @ -1,7 +1,8 @@ | |||||||
| ## 0.15.0-dev16 | ## 0.15.0 | ||||||
| 
 | 
 | ||||||
| ### Enhancements | ### Enhancements | ||||||
| 
 | 
 | ||||||
|  | * **Improve text clearing process in email partitioning.** Updated the email partitioner to remove both `=\n` and `=\r\n` characters during the clearing process. Previously, only `=\n` characters were removed.    | ||||||
| * **Bump unstructured.paddleocr to 2.8.0.1.** | * **Bump unstructured.paddleocr to 2.8.0.1.** | ||||||
| * **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation. | * **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation. | ||||||
| * **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner. | * **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner. | ||||||
|  | |||||||
							
								
								
									
										39
									
								
								example-docs/eml/family-day.eml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								example-docs/eml/family-day.eml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,39 @@ | |||||||
|  | MIME-Version: 1.0 | ||||||
|  | Date: Wed, 21 Dec 2022 10:28:53 -0600 | ||||||
|  | Message-ID: <CAPgNNXQKR=o6AsOTr74VMrsDNhUJW0Keou9n3vLa2UO_Nv+tZw@mail.gmail.com> | ||||||
|  | Subject: Family Day | ||||||
|  | From: Mallori Harrell <mallori@unstructured.io> | ||||||
|  | To: Mallori Harrell <mallori@unstructured.io> | ||||||
|  | Content-Type: multipart/alternative; boundary="0000000000005c115405f0590ce4" | ||||||
|  | 
 | ||||||
|  | --0000000000005c115405f0590ce4 | ||||||
|  | Content-Type: text/plain; charset="UTF-8" | ||||||
|  | 
 | ||||||
|  | Hi All, | ||||||
|  | 
 | ||||||
|  | Get excited for our first annual family day! | ||||||
|  | 
 | ||||||
|  | There will be face painting, a petting zoo, funnel cake and more. | ||||||
|  | 
 | ||||||
|  | Make sure to RSVP! | ||||||
|  | 
 | ||||||
|  | Best. | ||||||
|  | 
 | ||||||
|  | --  | ||||||
|  | Mallori Harrell | ||||||
|  | Unstructured Technologies | ||||||
|  | Data Scientist | ||||||
|  | 
 | ||||||
|  | --0000000000005c115405f0590ce4 | ||||||
|  | Content-Type: text/html; charset="UTF-8" | ||||||
|  | Content-Transfer-Encoding: quoted-printable | ||||||
|  | 
 | ||||||
|  | <div dir=3D"ltr">Hi All,<div><br></div><div>Get excited for our first annua= | ||||||
|  | l family day!=C2=A0</div><div><br></div><div>There will be face painting, = | ||||||
|  | a petting zoo, funnel cake and more.</div><div><br></div><div>Make sure to = | ||||||
|  | RSVP!</div><div><br></div><div>Best.<br clear=3D"all"><div><br></div>-- <br= | ||||||
|  | ><div dir=3D"ltr" class=3D"gmail_signature" data-smartmail=3D"gmail_signatu= | ||||||
|  | re"><div dir=3D"ltr">Mallori Harrell<div>Unstructured Technologies<br><div>= | ||||||
|  | Data Scientist</div><div><br></div></div></div></div></div></div> | ||||||
|  | 
 | ||||||
|  | --0000000000005c115405f0590ce4-- | ||||||
| @ -2,6 +2,7 @@ import datetime | |||||||
| import email | import email | ||||||
| import os | import os | ||||||
| import pathlib | import pathlib | ||||||
|  | import tempfile | ||||||
| 
 | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| @ -230,6 +231,17 @@ def test_partition_email_from_file_rb_default_encoding(filename, expected_output | |||||||
|         assert element.metadata.filename is None |         assert element.metadata.filename is None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_partition_email_from_spooled_temp_file(): | ||||||
|  |     filename = example_doc_path("eml/family-day.eml") | ||||||
|  |     with open(filename, "rb") as test_file: | ||||||
|  |         spooled_temp_file = tempfile.SpooledTemporaryFile() | ||||||
|  |         spooled_temp_file.write(test_file.read()) | ||||||
|  |         spooled_temp_file.seek(0) | ||||||
|  |         elements = partition_email(file=spooled_temp_file) | ||||||
|  |         assert len(elements) == 9 | ||||||
|  |         assert elements[3].text == "Make sure to RSVP!" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_partition_email_from_text_file(): | def test_partition_email_from_text_file(): | ||||||
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") | ||||||
|     with open(filename) as f: |     with open(filename) as f: | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| __version__ = "0.15.0-dev16"  # pragma: no cover | __version__ = "0.15.0"  # pragma: no cover | ||||||
|  | |||||||
| @ -416,8 +416,8 @@ def partition_email( | |||||||
|         #    <li>Item 1</li>= |         #    <li>Item 1</li>= | ||||||
|         #    <li>Item 2<li>= |         #    <li>Item 2<li>= | ||||||
|         # </ul> |         # </ul> | ||||||
|         list_content = content.split("=\n") | 
 | ||||||
|         content = "".join(list_content) |         content = content.replace("=\n", "").replace("=\r\n", "") | ||||||
|         elements = partition_html( |         elements = partition_html( | ||||||
|             text=content, |             text=content, | ||||||
|             include_metadata=False, |             include_metadata=False, | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Christine Straub
						Christine Straub