mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	feat: attachment processing for emails (#855)
* process attachments for email * add attachment processing to msg * fix up metadata for attachments * add test for processing email attachments * added test for processing msg attachments * update docs * tests for error conditions * version and changelog
This commit is contained in:
		
							parent
							
								
									92e55eb89e
								
							
						
					
					
						commit
						c581a33c8a
					
				| @ -2,6 +2,9 @@ | ||||
| 
 | ||||
| ### Enhancements | ||||
| 
 | ||||
| * `partition_email` and `partition_msg` will now process attachments if `process_attachments=True` | ||||
|   and an attachment partitioning function is passed through with `attachment_partitioner=partition`. | ||||
| 
 | ||||
| ### Features | ||||
| 
 | ||||
| ### Fixes | ||||
|  | ||||
| @ -98,8 +98,8 @@ about the library. | ||||
| | Document Type | Partition Function | Strategies | Table Support | Options | | ||||
| | --- | --- | --- | --- | --- | | ||||
| | CSV Files (`.csv`) | `partition_csv` | N/A | Yes | None | | ||||
| | E-mails (`.eml`) | `partition_eml` | N/A | No | Encoding; Max Partition | | ||||
| | E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition | | ||||
| | E-mails (`.eml`) | `partition_email` | N/A | No | Encoding; Max Partition; Process Attachments | | ||||
| | E-mails (`.msg`) | `partition_msg` | N/A | No | Encoding; Max Partition; Process Attachments | | ||||
| | EPubs (`.epub`) | `partition_epub` | N/A | Yes | Include Page Breaks | | ||||
| | Excel Documents (`.xlsx`/`.xls`) | `partition_xlsx` | N/A | Yes | None | | ||||
| | HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks | | ||||
|  | ||||
| @ -273,6 +273,22 @@ the average character length for a paragraph. | ||||
| You can disable ``max_partition`` by setting it to ``None``. | ||||
| 
 | ||||
| 
 | ||||
| You can optionally partition e-mail attachments by setting ``process_attachments=True``. | ||||
| If you set ``process_attachments=True``, you'll also need to pass in a partitioning | ||||
| function to ``attachment_partitioner``. The following is an example of what the | ||||
| workflow looks like: | ||||
| 
 | ||||
| .. code:: python | ||||
| 
 | ||||
|   from unstructured.partition.auto import partition | ||||
|   from unstructured.partition.email import partition_email | ||||
| 
 | ||||
|   filename = "example-docs/eml/fake-email-attachment.eml" | ||||
|   elements = partition_email( | ||||
|     filename=filename, process_attachments=True, attachment_partitioner=partition | ||||
|   ) | ||||
| 
 | ||||
| 
 | ||||
| ``partition_epub`` | ||||
| --------------------- | ||||
| 
 | ||||
| @ -439,6 +455,22 @@ the average character length for a paragraph. | ||||
| You can disable ``max_partition`` by setting it to ``None``. | ||||
| 
 | ||||
| 
 | ||||
| You can optionally partition e-mail attachments by setting ``process_attachments=True``. | ||||
| If you set ``process_attachments=True``, you'll also need to pass in a partitioning | ||||
| function to ``attachment_partitioner``. The following is an example of what the | ||||
| workflow looks like: | ||||
| 
 | ||||
| .. code:: python | ||||
| 
 | ||||
|   from unstructured.partition.auto import partition | ||||
|   from unstructured.partition.msg import partition_msg | ||||
| 
 | ||||
|   filename = "example-docs/fake-email-attachment.msg" | ||||
|   elements = partition_msg( | ||||
|     filename=filename, process_attachments=True, attachment_partitioner=partition | ||||
|   ) | ||||
| 
 | ||||
| 
 | ||||
| ``partition_multiple_via_api`` | ||||
| ------------------------------ | ||||
| 
 | ||||
|  | ||||
| @ -25,6 +25,7 @@ from unstructured.partition.email import ( | ||||
|     partition_email, | ||||
|     partition_email_header, | ||||
| ) | ||||
| from unstructured.partition.text import partition_text | ||||
| 
 | ||||
| FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve() | ||||
| EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml") | ||||
| @ -326,3 +327,42 @@ def test_partition_email_still_works_with_no_content(): | ||||
|     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml") | ||||
|     elements = partition_email(filename=filename) | ||||
|     assert elements == [] | ||||
| 
 | ||||
| 
 | ||||
| def test_partition_email_can_process_attachments( | ||||
|     tmpdir, | ||||
|     filename="example-docs/eml/fake-email-attachment.eml", | ||||
| ): | ||||
|     with open(filename) as f: | ||||
|         msg = email.message_from_file(f) | ||||
|     extract_attachment_info(msg, output_dir=tmpdir.dirname) | ||||
|     attachment_filename = os.path.join(tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"]) | ||||
|     attachment_elements = partition_text( | ||||
|         filename=attachment_filename, | ||||
|         metadata_filename=attachment_filename, | ||||
|     ) | ||||
|     expected_metadata = attachment_elements[0].metadata | ||||
|     expected_metadata.file_directory = None | ||||
|     expected_metadata.attached_to_filename = filename | ||||
| 
 | ||||
|     elements = partition_email( | ||||
|         filename=filename, | ||||
|         attachment_partitioner=partition_text, | ||||
|         process_attachments=True, | ||||
|     ) | ||||
| 
 | ||||
|     assert elements[0].text.startswith("Hello!") | ||||
| 
 | ||||
|     for element in elements[:-1]: | ||||
|         assert element.metadata.filename == "fake-email-attachment.eml" | ||||
|         assert element.metadata.subject == "Fake email with attachment" | ||||
| 
 | ||||
|     assert elements[-1].text == "Hey this is a fake attachment!" | ||||
|     assert elements[-1].metadata == expected_metadata | ||||
| 
 | ||||
| 
 | ||||
| def test_partition_msg_raises_with_no_partitioner( | ||||
|     filename="example-docs/eml/fake-email-attachment.eml", | ||||
| ): | ||||
|     with pytest.raises(ValueError): | ||||
|         partition_email(filename=filename, process_attachments=True) | ||||
|  | ||||
| @ -11,6 +11,7 @@ from unstructured.documents.elements import ( | ||||
|     Title, | ||||
| ) | ||||
| from unstructured.partition.msg import extract_msg_attachment_info, partition_msg | ||||
| from unstructured.partition.text import partition_text | ||||
| 
 | ||||
| DIRECTORY = pathlib.Path(__file__).parent.resolve() | ||||
| EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs") | ||||
| @ -94,3 +95,40 @@ def test_partition_msg_raises_with_both_specified(): | ||||
| def test_partition_msg_raises_with_neither(): | ||||
|     with pytest.raises(ValueError): | ||||
|         partition_msg() | ||||
| 
 | ||||
| 
 | ||||
| def test_partition_msg_can_process_attachments( | ||||
|     tmpdir, | ||||
|     filename="example-docs/fake-email-attachment.msg", | ||||
| ): | ||||
|     extract_msg_attachment_info(filename=filename, output_dir=tmpdir.dirname) | ||||
|     attachment_filename = os.path.join(tmpdir.dirname, ATTACH_EXPECTED_OUTPUT[0]["filename"]) | ||||
|     attachment_elements = partition_text( | ||||
|         filename=attachment_filename, | ||||
|         metadata_filename=attachment_filename, | ||||
|     ) | ||||
|     expected_metadata = attachment_elements[0].metadata | ||||
|     expected_metadata.file_directory = None | ||||
|     expected_metadata.attached_to_filename = filename | ||||
| 
 | ||||
|     elements = partition_msg( | ||||
|         filename=filename, | ||||
|         attachment_partitioner=partition_text, | ||||
|         process_attachments=True, | ||||
|     ) | ||||
| 
 | ||||
|     assert elements[0].text.startswith("Hello!") | ||||
| 
 | ||||
|     for element in elements[:-1]: | ||||
|         assert element.metadata.filename == "fake-email-attachment.msg" | ||||
|         assert element.metadata.subject == "Fake email with attachment" | ||||
| 
 | ||||
|     assert elements[-1].text == "Hey this is a fake attachment!" | ||||
|     assert elements[-1].metadata == expected_metadata | ||||
| 
 | ||||
| 
 | ||||
| def test_partition_msg_raises_with_no_partitioner( | ||||
|     filename="example-docs/fake-email-attachment.msg", | ||||
| ): | ||||
|     with pytest.raises(ValueError): | ||||
|         partition_msg(filename=filename, process_attachments=True) | ||||
|  | ||||
| @ -50,6 +50,7 @@ class ElementMetadata: | ||||
|     file_directory: Optional[str] = None | ||||
|     date: Optional[str] = None | ||||
|     filetype: Optional[str] = None | ||||
|     attached_to_filename: Optional[str] = None | ||||
| 
 | ||||
|     # Page numbers currently supported for PDF, HTML and PPT documents | ||||
|     page_number: Optional[int] = None | ||||
|  | ||||
| @ -516,11 +516,14 @@ def add_metadata_with_filetype(filetype: FileType): | ||||
|                     kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html") | ||||
|                 } | ||||
|                 for element in elements: | ||||
|                     _add_element_metadata( | ||||
|                         element, | ||||
|                         filetype=FILETYPE_TO_MIMETYPE[filetype], | ||||
|                         **metadata_kwargs,  # type: ignore | ||||
|                     ) | ||||
|                     # NOTE(robinson) - Attached files have already run through this logic | ||||
|                     # in their own partitioning function | ||||
|                     if element.metadata.attached_to_filename is None: | ||||
|                         _add_element_metadata( | ||||
|                             element, | ||||
|                             filetype=FILETYPE_TO_MIMETYPE[filetype], | ||||
|                             **metadata_kwargs,  # type: ignore | ||||
|                         ) | ||||
| 
 | ||||
|                 return elements | ||||
|             else: | ||||
|  | ||||
| @ -1,11 +1,12 @@ | ||||
| import datetime | ||||
| import email | ||||
| import os | ||||
| import re | ||||
| import sys | ||||
| from email.message import Message | ||||
| from functools import partial | ||||
| from tempfile import SpooledTemporaryFile | ||||
| from typing import IO, Dict, List, Optional, Tuple, Union | ||||
| from tempfile import SpooledTemporaryFile, TemporaryDirectory | ||||
| from typing import IO, Callable, Dict, List, Optional, Tuple, Union | ||||
| 
 | ||||
| from unstructured.file_utils.encoding import ( | ||||
|     COMMON_ENCODINGS, | ||||
| @ -226,6 +227,9 @@ def partition_email( | ||||
|     encoding: Optional[str] = None, | ||||
|     include_headers: bool = False, | ||||
|     max_partition: Optional[int] = 1500, | ||||
|     metadata_filename: Optional[str] = None, | ||||
|     process_attachments: bool = False, | ||||
|     attachment_partitioner: Optional[Callable] = None, | ||||
|     **kwargs, | ||||
| ) -> List[Element]: | ||||
|     """Partitions an .eml documents into its constituent elements. | ||||
| @ -245,6 +249,13 @@ def partition_email( | ||||
|     max_partition | ||||
|         The maximum number of characters to include in a partition. If None is passed, | ||||
|         no maximum is applied. Only applies if processing the text/plain content. | ||||
|     metadata_filename | ||||
|         The filename to use for the metadata. | ||||
|     process_attachments | ||||
|         If True, partition_email will process email attachments in addition to | ||||
|         processing the content of the email itself. | ||||
|     attachment_partitioner | ||||
|         The partitioning function to use to process attachments. | ||||
|     """ | ||||
|     if content_source not in VALID_CONTENT_SOURCES: | ||||
|         raise ValueError( | ||||
| @ -258,6 +269,8 @@ def partition_email( | ||||
|     # Verify that only one of the arguments was provided | ||||
|     exactly_one(filename=filename, file=file, text=text) | ||||
| 
 | ||||
|     metadata_filename = metadata_filename or filename | ||||
| 
 | ||||
|     detected_encoding = "utf-8" | ||||
|     if filename is not None: | ||||
|         extracted_encoding, msg = parse_email(filename=filename) | ||||
| @ -341,7 +354,25 @@ def partition_email( | ||||
|         header = partition_email_header(msg) | ||||
|     all_elements = header + elements | ||||
| 
 | ||||
|     metadata = build_email_metadata(msg, filename=filename) | ||||
|     metadata = build_email_metadata(msg, filename=metadata_filename) | ||||
|     for element in all_elements: | ||||
|         element.metadata = metadata | ||||
| 
 | ||||
|     if process_attachments: | ||||
|         with TemporaryDirectory() as tmpdir: | ||||
|             extract_attachment_info(msg, tmpdir) | ||||
|             attached_files = os.listdir(tmpdir) | ||||
|             for attached_file in attached_files: | ||||
|                 attached_filename = os.path.join(tmpdir, attached_file) | ||||
|                 if attachment_partitioner is None: | ||||
|                     raise ValueError( | ||||
|                         "Specify the attachment_partitioner kwarg to process attachments.", | ||||
|                     ) | ||||
|                 attached_elements = attachment_partitioner(filename=attached_filename) | ||||
|                 for element in attached_elements: | ||||
|                     element.metadata.filename = attached_file | ||||
|                     element.metadata.file_directory = None | ||||
|                     element.metadata.attached_to_filename = metadata_filename | ||||
|                     all_elements.append(element) | ||||
| 
 | ||||
|     return all_elements | ||||
|  | ||||
| @ -1,5 +1,6 @@ | ||||
| import os | ||||
| import tempfile | ||||
| from typing import IO, Dict, List, Optional | ||||
| from typing import IO, Callable, Dict, List, Optional | ||||
| 
 | ||||
| import msg_parser | ||||
| 
 | ||||
| @ -17,6 +18,9 @@ def partition_msg( | ||||
|     filename: Optional[str] = None, | ||||
|     file: Optional[IO] = None, | ||||
|     max_partition: Optional[int] = 1500, | ||||
|     metadata_filename: Optional[str] = None, | ||||
|     process_attachments: bool = False, | ||||
|     attachment_partitioner: Optional[Callable] = None, | ||||
|     **kwargs, | ||||
| ) -> List[Element]: | ||||
|     """Partitions a MSFT Outlook .msg file | ||||
| @ -30,6 +34,13 @@ def partition_msg( | ||||
|     max_partition | ||||
|         The maximum number of characters to include in a partition. If None is passed, | ||||
|         no maximum is applied. Only applies if processing text/plain content. | ||||
|     metadata_filename | ||||
|         The filename to use for the metadata. | ||||
|     process_attachments | ||||
|         If True, partition_msg will process email attachments in addition to | ||||
|         processing the content of the email itself. | ||||
|     attachment_partitioner | ||||
|         The partitioning function to use to process attachments. | ||||
|     """ | ||||
|     exactly_one(filename=filename, file=file) | ||||
| 
 | ||||
| @ -41,16 +52,35 @@ def partition_msg( | ||||
|         tmp.close() | ||||
|         msg_obj = msg_parser.MsOxMessage(tmp.name) | ||||
| 
 | ||||
|     metadata_filename = metadata_filename or filename | ||||
| 
 | ||||
|     text = msg_obj.body | ||||
|     if "<html>" in text or "</div>" in text: | ||||
|         elements = partition_html(text=text) | ||||
|     else: | ||||
|         elements = partition_text(text=text, max_partition=max_partition) | ||||
| 
 | ||||
|     metadata = build_msg_metadata(msg_obj, filename) | ||||
|     metadata = build_msg_metadata(msg_obj, metadata_filename) | ||||
|     for element in elements: | ||||
|         element.metadata = metadata | ||||
| 
 | ||||
|     if process_attachments: | ||||
|         with tempfile.TemporaryDirectory() as tmpdir: | ||||
|             extract_msg_attachment_info(msg_obj=msg_obj, output_dir=tmpdir) | ||||
|             attached_files = os.listdir(tmpdir) | ||||
|             for attached_file in attached_files: | ||||
|                 attached_filename = os.path.join(tmpdir, attached_file) | ||||
|                 if attachment_partitioner is None: | ||||
|                     raise ValueError( | ||||
|                         "Specify the attachment_partitioner kwarg to process attachments.", | ||||
|                     ) | ||||
|                 attached_elements = attachment_partitioner(filename=attached_filename) | ||||
|                 for element in attached_elements: | ||||
|                     element.metadata.filename = attached_file | ||||
|                     element.metadata.file_directory = None | ||||
|                     element.metadata.attached_to_filename = metadata_filename | ||||
|                     elements.append(element) | ||||
| 
 | ||||
|     return elements | ||||
| 
 | ||||
| 
 | ||||
| @ -78,11 +108,12 @@ def build_msg_metadata(msg_obj: msg_parser.MsOxMessage, filename: Optional[str]) | ||||
| 
 | ||||
| 
 | ||||
| def extract_msg_attachment_info( | ||||
|     filename: str, | ||||
|     filename: Optional[str] = None, | ||||
|     file: Optional[IO] = None, | ||||
|     output_dir: Optional[str] = None, | ||||
|     msg_obj: Optional[msg_parser.MsOxMessage] = None, | ||||
| ) -> List[Dict[str, str]]: | ||||
|     exactly_one(filename=filename, file=file) | ||||
|     exactly_one(filename=filename, file=file, msg_obj=msg_obj) | ||||
| 
 | ||||
|     if filename is not None: | ||||
|         msg_obj = msg_parser.MsOxMessage(filename) | ||||
| @ -91,6 +122,8 @@ def extract_msg_attachment_info( | ||||
|         tmp.write(file.read()) | ||||
|         tmp.close() | ||||
|         msg_obj = msg_parser.MsOxMessage(tmp.name) | ||||
|     elif msg_obj is not None: | ||||
|         msg_obj = msg_obj | ||||
| 
 | ||||
|     list_attachments = [] | ||||
| 
 | ||||
| @ -105,8 +138,8 @@ def extract_msg_attachment_info( | ||||
|         list_attachments.append(attachment_info) | ||||
| 
 | ||||
|         if output_dir is not None: | ||||
|             filename = output_dir + "/" + attachment_info["filename"] | ||||
|             with open(filename, "wb") as f: | ||||
|             output_filename = output_dir + "/" + attachment_info["filename"] | ||||
|             with open(output_filename, "wb") as f: | ||||
|                 f.write(attachment.data) | ||||
| 
 | ||||
|     return list_attachments | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Matt Robinson
						Matt Robinson