| 
									
										
										
										
											2023-01-17 16:36:44 -06:00
										 |  |  | import datetime | 
					
						
							| 
									
										
										
										
											2023-01-03 11:41:54 -06:00
										 |  |  | import email | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | import os | 
					
						
							|  |  |  | import pathlib | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  | import pytest | 
					
						
							| 
									
										
										
										
											2023-01-17 16:36:44 -06:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-04 14:23:41 -04:00
										 |  |  | from unstructured.documents.elements import ( | 
					
						
							|  |  |  |     ElementMetadata, | 
					
						
							|  |  |  |     Image, | 
					
						
							|  |  |  |     ListItem, | 
					
						
							|  |  |  |     NarrativeText, | 
					
						
							|  |  |  |     Title, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  | from unstructured.documents.email_elements import ( | 
					
						
							|  |  |  |     MetaData, | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     ReceivedInfo, | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  |     Recipient, | 
					
						
							|  |  |  |     Sender, | 
					
						
							|  |  |  |     Subject, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from unstructured.partition.email import ( | 
					
						
							| 
									
										
										
										
											2023-05-11 10:36:25 -04:00
										 |  |  |     convert_to_iso_8601, | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  |     extract_attachment_info, | 
					
						
							|  |  |  |     partition_email, | 
					
						
							|  |  |  |     partition_email_header, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-06-29 18:01:12 -04:00
										 |  |  | from unstructured.partition.text import partition_text | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  | FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve() | 
					
						
							|  |  |  | EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs", "eml") | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | EXPECTED_OUTPUT = [ | 
					
						
							|  |  |  |     NarrativeText(text="This is a test email to use for unit tests."), | 
					
						
							|  |  |  |     Title(text="Important points:"), | 
					
						
							|  |  |  |     ListItem(text="Roses are red"), | 
					
						
							|  |  |  |     ListItem(text="Violets are blue"), | 
					
						
							|  |  |  | ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-09 19:49:19 -06:00
										 |  |  | IMAGE_EXPECTED_OUTPUT = [ | 
					
						
							|  |  |  |     NarrativeText(text="This is a test email to use for unit tests."), | 
					
						
							|  |  |  |     Title(text="Important points:"), | 
					
						
							|  |  |  |     NarrativeText(text="hello this is our logo."), | 
					
						
							|  |  |  |     Image(text="unstructured_logo.png"), | 
					
						
							|  |  |  |     ListItem(text="Roses are red"), | 
					
						
							|  |  |  |     ListItem(text="Violets are blue"), | 
					
						
							|  |  |  | ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-17 16:36:44 -06:00
										 |  |  | RECEIVED_HEADER_OUTPUT = [ | 
					
						
							|  |  |  |     ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"), | 
					
						
							|  |  |  |     ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"), | 
					
						
							|  |  |  |     ReceivedInfo( | 
					
						
							|  |  |  |         name="received_datetimetz", | 
					
						
							|  |  |  |         text="2023-02-20 10:03:18+12:00", | 
					
						
							|  |  |  |         datestamp=datetime.datetime( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |             2023, | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |             20, | 
					
						
							|  |  |  |             10, | 
					
						
							|  |  |  |             3, | 
					
						
							|  |  |  |             18, | 
					
						
							|  |  |  |             tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)), | 
					
						
							| 
									
										
										
										
											2023-01-17 16:36:44 -06:00
										 |  |  |         ), | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     MetaData(name="MIME-Version", text="1.0"), | 
					
						
							|  |  |  |     MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"), | 
					
						
							|  |  |  |     MetaData( | 
					
						
							|  |  |  |         name="Message-ID", | 
					
						
							|  |  |  |         text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>", | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     Subject(text="Test Email"), | 
					
						
							|  |  |  |     Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"), | 
					
						
							|  |  |  |     Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"), | 
					
						
							|  |  |  |     MetaData( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         name="Content-Type", | 
					
						
							|  |  |  |         text='multipart/alternative; boundary="00000000000095c9b205eff92630"', | 
					
						
							| 
									
										
										
										
											2023-01-17 16:36:44 -06:00
										 |  |  |     ), | 
					
						
							|  |  |  | ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  | HEADER_EXPECTED_OUTPUT = [ | 
					
						
							|  |  |  |     MetaData(name="MIME-Version", text="1.0"), | 
					
						
							|  |  |  |     MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"), | 
					
						
							|  |  |  |     MetaData( | 
					
						
							|  |  |  |         name="Message-ID", | 
					
						
							|  |  |  |         text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>", | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     Subject(text="Test Email"), | 
					
						
							|  |  |  |     Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"), | 
					
						
							|  |  |  |     Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"), | 
					
						
							|  |  |  |     MetaData( | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |         name="Content-Type", | 
					
						
							|  |  |  |         text='multipart/alternative; boundary="00000000000095c9b205eff92630"', | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  |     ), | 
					
						
							|  |  |  | ] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ALL_EXPECTED_OUTPUT = HEADER_EXPECTED_OUTPUT + EXPECTED_OUTPUT | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-03 11:41:54 -06:00
										 |  |  | ATTACH_EXPECTED_OUTPUT = [ | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     {"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"}, | 
					
						
							| 
									
										
										
										
											2023-01-03 11:41:54 -06:00
										 |  |  | ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | def test_partition_email_from_filename(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  |     elements = partition_email(filename=filename) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == EXPECTED_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "fake-email.eml" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_email_from_filename_with_metadata_filename(): | 
					
						
							|  |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") | 
					
						
							|  |  |  |     elements = partition_email(filename=filename, metadata_filename="test") | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert all(element.metadata.filename == "test" for element in elements) | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-06 09:49:27 -04:00
										 |  |  | def test_partition_email_from_filename_malformed_encoding(): | 
					
						
							|  |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-malformed-encoding.eml") | 
					
						
							|  |  |  |     elements = partition_email(filename=filename) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == EXPECTED_OUTPUT | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     ("filename", "expected_output"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("fake-email-utf-16.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("email-no-utf8-2008-07-16.062410.eml", None), | 
					
						
							|  |  |  |         ("email-no-utf8-2014-03-17.111517.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-1.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-2.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-3.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-4.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-5.eml", None), | 
					
						
							|  |  |  |     ], | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  | def test_partition_email_from_filename_default_encoding(filename, expected_output): | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename) | 
					
						
							|  |  |  |     elements = partition_email(filename=filename_path) | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  |     assert len(elements) > 0 | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     if expected_output: | 
					
						
							|  |  |  |         assert elements == expected_output | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == filename | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | def test_partition_email_from_file(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     with open(filename) as f: | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  |         elements = partition_email(file=f) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == EXPECTED_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     ("filename", "expected_output"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("fake-email-utf-16.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("email-no-utf8-2008-07-16.062410.eml", None), | 
					
						
							|  |  |  |         ("email-no-utf8-2014-03-17.111517.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-1.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-2.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-3.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-4.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-5.eml", None), | 
					
						
							|  |  |  |     ], | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  | def test_partition_email_from_file_default_encoding(filename, expected_output): | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename) | 
					
						
							|  |  |  |     with open(filename_path) as f: | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  |         elements = partition_email(file=f) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     if expected_output: | 
					
						
							|  |  |  |         assert elements == expected_output | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-09 16:15:14 -05:00
										 |  |  | def test_partition_email_from_file_rb(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") | 
					
						
							| 
									
										
										
										
											2023-01-09 16:15:14 -05:00
										 |  |  |     with open(filename, "rb") as f: | 
					
						
							|  |  |  |         elements = partition_email(file=f) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == EXPECTED_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2023-01-09 16:15:14 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     ("filename", "expected_output"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("fake-email-utf-16.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("fake-email-utf-16-be.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("fake-email-utf-16-le.eml", EXPECTED_OUTPUT), | 
					
						
							|  |  |  |         ("email-no-utf8-2008-07-16.062410.eml", None), | 
					
						
							|  |  |  |         ("email-no-utf8-2014-03-17.111517.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-1.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-2.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-3.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-4.eml", None), | 
					
						
							|  |  |  |         ("email-replace-mime-encodings-error-5.eml", None), | 
					
						
							|  |  |  |     ], | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  | def test_partition_email_from_file_rb_default_encoding(filename, expected_output): | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     filename_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename) | 
					
						
							|  |  |  |     with open(filename_path, "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  |         elements = partition_email(file=f) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     if expected_output: | 
					
						
							|  |  |  |         assert elements == expected_output | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2023-05-30 10:24:02 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  | def test_partition_email_from_text_file(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     with open(filename) as f: | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  |         elements = partition_email(file=f, content_source="text/plain") | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == EXPECTED_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_email_from_text_file_with_headers(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     with open(filename) as f: | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  |         elements = partition_email(file=f, content_source="text/plain", include_headers=True) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == ALL_EXPECTED_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-24 10:57:24 -05:00
										 |  |  | def test_partition_email_from_text_file_max(): | 
					
						
							|  |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") | 
					
						
							|  |  |  |     with open(filename) as f: | 
					
						
							|  |  |  |         elements = partition_email(file=f, content_source="text/plain", max_partition=20) | 
					
						
							|  |  |  |     assert len(elements) == 6 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_partition_email_from_text_file_raises_value_error(): | 
					
						
							|  |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt") | 
					
						
							|  |  |  |     with pytest.raises(ValueError), open(filename) as f: | 
					
						
							|  |  |  |         partition_email(file=f, content_source="text/plain", min_partition=1000) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | def test_partition_email_from_text(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml") | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     with open(filename) as f: | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  |         text = f.read() | 
					
						
							|  |  |  |     elements = partition_email(text=text) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == EXPECTED_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-28 17:03:51 -04:00
										 |  |  | def test_partition_email_from_text_work_with_empty_string(): | 
					
						
							|  |  |  |     assert partition_email(text="") == [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-09 19:49:19 -06:00
										 |  |  | def test_partition_email_from_filename_with_embedded_image(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-image-embedded.eml") | 
					
						
							| 
									
										
										
										
											2023-01-09 19:49:19 -06:00
										 |  |  |     elements = partition_email(filename=filename, content_source="text/plain") | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							|  |  |  |     assert elements == IMAGE_EXPECTED_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "fake-email-image-embedded.eml" | 
					
						
							| 
									
										
										
										
											2023-01-09 19:49:19 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | def test_partition_email_from_file_with_header(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml") | 
					
						
							| 
									
										
										
										
											2023-02-27 17:30:54 +01:00
										 |  |  |     with open(filename) as f: | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  |         msg = email.message_from_file(f) | 
					
						
							|  |  |  |     elements = partition_email_header(msg) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							| 
									
										
										
										
											2023-01-17 16:36:44 -06:00
										 |  |  |     assert elements == RECEIVED_HEADER_OUTPUT | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename is None | 
					
						
							| 
									
										
										
										
											2023-01-09 11:08:08 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  | def test_partition_email_from_filename_has_metadata(): | 
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  |     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml") | 
					
						
							| 
									
										
										
										
											2023-04-04 14:23:41 -04:00
										 |  |  |     elements = partition_email(filename=filename) | 
					
						
							|  |  |  |     assert len(elements) > 0 | 
					
						
							| 
									
										
										
										
											2023-06-16 10:10:56 -04:00
										 |  |  |     assert ( | 
					
						
							|  |  |  |         elements[0].metadata.to_dict() | 
					
						
							|  |  |  |         == ElementMetadata( | 
					
						
							| 
									
										
										
										
											2023-07-05 11:25:11 -07:00
										 |  |  |             coordinates=None, | 
					
						
							| 
									
										
										
										
											2023-06-16 10:10:56 -04:00
										 |  |  |             filename=filename, | 
					
						
							|  |  |  |             date="2022-12-16T17:04:16-05:00", | 
					
						
							|  |  |  |             page_number=None, | 
					
						
							|  |  |  |             url=None, | 
					
						
							|  |  |  |             sent_from=["Matthew Robinson <mrobinson@unstructured.io>"], | 
					
						
							|  |  |  |             sent_to=["Matthew Robinson <mrobinson@unstructured.io>"], | 
					
						
							|  |  |  |             subject="Test Email", | 
					
						
							|  |  |  |             filetype="message/rfc822", | 
					
						
							|  |  |  |         ).to_dict() | 
					
						
							| 
									
										
										
										
											2023-04-04 14:23:41 -04:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-05-12 11:33:01 -04:00
										 |  |  |     expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00") | 
					
						
							|  |  |  |     assert elements[0].metadata.get_date() == expected_dt | 
					
						
							| 
									
										
										
										
											2023-07-05 15:02:22 -05:00
										 |  |  |     for element in elements: | 
					
						
							|  |  |  |         assert element.metadata.filename == "fake-email-header.eml" | 
					
						
							| 
									
										
										
										
											2023-05-12 11:33:01 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-04 14:23:41 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-17 11:33:45 -05:00
										 def test_extract_email_text_matches_html():
										     """Check that the text/plain and text/html parts partition to equal elements.

										     Partitions ``fake-email-attachment.eml`` once per content source and
										     compares the two element lists pairwise.
										     """
										     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
										     elements_from_text = partition_email(filename=filename, content_source="text/plain")
										     elements_from_html = partition_email(filename=filename, content_source="text/html")
										     assert len(elements_from_text) == len(elements_from_html)
										     # NOTE(robinson) - checking each individually is necessary because the text/html returns
										     # HTMLTitle, HTMLNarrativeText, etc
										     for text_element, html_element in zip(elements_from_text, elements_from_html):
										         # Compare against the HTML-derived element: comparing a text element
										         # with itself is always true and would verify nothing.
										         assert text_element == html_element
										         assert text_element.metadata.filename == "fake-email-attachment.eml"
					
						
							| 
									
										
										
										
											2023-01-17 11:33:45 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-03 11:41:54 -06:00
										 def test_extract_attachment_info():
										     """Attachment info extracted from the sample email equals ATTACH_EXPECTED_OUTPUT."""
										     eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-attachment.eml")
										     with open(eml_path) as eml_file:
										         message = email.message_from_file(eml_file)
										     found_attachments = extract_attachment_info(message)
										     assert len(found_attachments) > 0
										     assert found_attachments == ATTACH_EXPECTED_OUTPUT
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-12-19 13:02:44 -05:00
										 def test_partition_email_raises_with_none_specified():
										     """Calling partition_email with no arguments is expected to raise ValueError."""
										     # Callable form of pytest.raises: invokes partition_email() with no arguments.
										     pytest.raises(ValueError, partition_email)
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							def test_partition_email_raises_with_too_many_specified():
							    """Passing both ``filename`` and ``text`` is expected to raise ValueError."""
							    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
							    with open(eml_path) as eml_file:
							        raw_email = eml_file.read()
							    # Two input sources at once: the file path and its contents as text.
							    conflicting_inputs = {"filename": eml_path, "text": raw_email}
							    with pytest.raises(ValueError):
							        partition_email(**conflicting_inputs)
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							def test_partition_email_raises_with_invalid_content_type():
							    """Passing ``content_source="application/json"`` is expected to raise ValueError."""
							    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
							    # Callable form of pytest.raises: the keyword arguments are forwarded
							    # to partition_email.
							    pytest.raises(
							        ValueError,
							        partition_email,
							        filename=eml_path,
							        content_source="application/json",
							    )
					
						
							| 
									
										
										
										
											2023-03-10 18:10:39 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							def test_partition_email_processes_fake_email_with_header():
							    """Partitioning ``fake-email-header.eml`` yields elements tagged with that file name."""
							    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
							    partitioned = partition_email(filename=eml_path)
							    assert len(partitioned) > 0
							    assert all(
							        item.metadata.filename == "fake-email-header.eml" for item in partitioned
							    )
					
						
							| 
									
										
										
										
											2023-05-11 10:36:25 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							@pytest.mark.parametrize(
							    ("time", "expected"),
							    [
							        ("Thu,  4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
							        ("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
							        ("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
							        ("Thursday 5/3/2023 02:32:49", None),
							    ],
							)
							def test_convert_to_iso_8601(time, expected):
							    """convert_to_iso_8601 returns the expected value for each parametrized input.

							    The last case expects ``None`` rather than an ISO 8601 string.
							    """
							    assert convert_to_iso_8601(time) == expected
					
						
							| 
									
										
										
										
											2023-06-16 17:52:13 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-22 12:52:27 -04:00
										 def test_partition_email_still_works_with_no_content():
										     """Partitioning ``email-no-html-content-1.eml`` returns an empty list."""
										     eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "email-no-html-content-1.eml")
										     assert partition_email(filename=eml_path) == []
					
						
							| 
									
										
										
										
											2023-06-29 18:01:12 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-30 09:44:46 -05:00
										 def test_partition_email_from_filename_exclude_metadata():
										     """With ``include_metadata=False`` the first element's metadata fields are None.

										     Input is given by filename; checks date, filetype, page_name and filename.
										     """
										     eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email-header.eml")
										     partitioned = partition_email(filename=eml_path, include_metadata=False)
										     first_metadata = partitioned[0].metadata
										     assert first_metadata.get_date() is None
										     assert first_metadata.filetype is None
										     assert first_metadata.page_name is None
										     assert first_metadata.filename is None
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							def test_partition_email_from_text_file_exclude_metadata():
							    """With ``include_metadata=False`` the first element's metadata fields are None.

							    Input is an open ``fake-email.txt`` file partitioned with
							    ``content_source="text/plain"``.
							    """
							    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.txt")
							    with open(txt_path) as txt_file:
							        partitioned = partition_email(
							            file=txt_file,
							            content_source="text/plain",
							            include_metadata=False,
							        )
							    first_metadata = partitioned[0].metadata
							    assert first_metadata.get_date() is None
							    assert first_metadata.filetype is None
							    assert first_metadata.page_name is None
							    assert first_metadata.filename is None
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							def test_partition_email_from_file_exclude_metadata():
							    """With ``include_metadata=False`` the first element's metadata fields are None.

							    Input is an open ``fake-email.eml`` file object.
							    """
							    eml_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
							    with open(eml_path) as eml_file:
							        partitioned = partition_email(file=eml_file, include_metadata=False)
							    first_metadata = partitioned[0].metadata
							    assert first_metadata.get_date() is None
							    assert first_metadata.filetype is None
							    assert first_metadata.page_name is None
							    assert first_metadata.filename is None
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-06-29 18:01:12 -04:00
										 def test_partition_email_can_process_attachments(
										     tmpdir,
										     filename="example-docs/eml/fake-email-attachment.eml",
										 ):
										     """Check that partition_email partitions attachments when asked to.

										     The attachment is first extracted and partitioned on its own with
										     ``partition_text`` to build the expected metadata. The email is then
										     partitioned with ``process_attachments=True`` and the last element is
										     compared against that expectation.

										     Args:
										         tmpdir: pytest temporary-directory fixture; the attachment is written here.
										         filename: path of the email to partition.
										     """
										     with open(filename) as f:
										         msg = email.message_from_file(f)
										     # Extract into this test's own temporary directory. ``tmpdir.dirname`` is the
										     # parent of that directory, so writing there would put the attachment outside
										     # the per-test directory.
										     output_dir = str(tmpdir)
										     extract_attachment_info(msg, output_dir=output_dir)
										     attachment_filename = os.path.join(output_dir, ATTACH_EXPECTED_OUTPUT[0]["filename"])
										     attachment_elements = partition_text(
										         filename=attachment_filename,
										         metadata_filename=attachment_filename,
										     )
										     expected_metadata = attachment_elements[0].metadata
										     # The expectation carries no directory and points back at the parent email.
										     expected_metadata.file_directory = None
										     expected_metadata.attached_to_filename = filename

										     elements = partition_email(
										         filename=filename,
										         attachment_partitioner=partition_text,
										         process_attachments=True,
										     )

										     assert elements[0].text.startswith("Hello!")

										     # Every element except the last is expected to come from the email body.
										     for element in elements[:-1]:
										         assert element.metadata.filename == "fake-email-attachment.eml"
										         assert element.metadata.subject == "Fake email with attachment"

										     # The last element is expected to come from the attachment.
										     assert elements[-1].text == "Hey this is a fake attachment!"
										     assert elements[-1].metadata == expected_metadata
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							def test_partition_msg_raises_with_no_partitioner(
							    filename="example-docs/eml/fake-email-attachment.eml",
							):
							    """``process_attachments=True`` without an attachment partitioner raises ValueError."""
							    # Callable form of pytest.raises: the keyword arguments are forwarded
							    # to partition_email.
							    pytest.raises(
							        ValueError,
							        partition_email,
							        filename=filename,
							        process_attachments=True,
							    )