@pytest.mark.parametrize(
    ("time", "expected"),
    [
        # Canonical RFC-2822 timestamp.
        ("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        # Padded single-digit day (two spaces after the comma). Exercises the
        # whitespace clean-up step that runs before the regex search.
        # NOTE(review): assumes clean_extra_whitespace collapses runs of spaces - confirm.
        ("Thu,  4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
        # Trailing timezone comment after the numeric offset is ignored.
        ("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
        # Not RFC-2822 at all: the helper reports failure by returning None.
        ("Thursday 5/3/2023 02:32:49", None),
    ],
)
def test_convert_to_iso_8601(time, expected):
    """Check that email date headers are converted to ISO-8601, or to None when unparseable."""
    iso_time = convert_to_iso_8601(time)
    assert iso_time == expected
# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
# The pattern checks only the *shape* of the timestamp (three letters, digits,
# numeric offset); it does not validate weekday/month names or field ranges.
EMAIL_DATETIMETZ_PATTERN = (
    r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"  # noqa: W605,E501
)
EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)


def convert_to_iso_8601(time: str) -> Optional[str]:
    """Converts the datetime from the email output to ISO-8601 format.

    The input is passed through ``clean_extra_whitespace`` and then searched for
    the first substring matching ``EMAIL_DATETIMETZ_PATTERN_RE``, so text around
    the timestamp (for example a trailing ``(UTC)`` comment) is ignored.

    Returns the ISO-8601 string, or ``None`` (after logging a warning) when no
    parseable RFC-2822 style timestamp is found.
    """
    cleaned_time = clean_extra_whitespace(time)
    regex_match = EMAIL_DATETIMETZ_PATTERN_RE.search(cleaned_time)
    if regex_match is None:
        logger.warning(f"{time} did not match RFC-2822 format. Unable to extract the time.")
        return None

    dt_string = regex_match.group()
    try:
        datetime_object = datetime.datetime.strptime(dt_string, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
        # The regex accepts any three letters and any digits, so strings such as
        # "Foo, 4 Bar 2023 02:32:49 +0000" or an out-of-range "99:99:99" get this
        # far. Treat them like a non-match instead of propagating the exception.
        logger.warning(f"{time} did not match RFC-2822 format. Unable to extract the time.")
        return None
    return datetime_object.isoformat()