diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff0c2235c..2d4d5e980 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.7.5-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* Adds handling for emails that do not have a datetime to extract.
+
 ## 0.7.4
 
 ### Enhancements
diff --git a/test_unstructured/cleaners/test_extract.py b/test_unstructured/cleaners/test_extract.py
index 346f2f2ac..e54b2f74e 100644
--- a/test_unstructured/cleaners/test_extract.py
+++ b/test_unstructured/cleaners/test_extract.py
@@ -64,6 +64,10 @@ def test_extract_datetimetz():
     )
 
 
+def test_extract_datetimetz_works_with_no_date():
+    assert extract.extract_datetimetz("NO DATE HERE") is None
+
+
 @pytest.mark.parametrize(
     ("text", "expected"),
     [
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 63ee408a7..1da145839 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.4"  # pragma: no cover
+__version__ = "0.7.5-dev0"  # pragma: no cover
diff --git a/unstructured/cleaners/extract.py b/unstructured/cleaners/extract.py
index 51cc031f2..69c399984 100644
--- a/unstructured/cleaners/extract.py
+++ b/unstructured/cleaners/extract.py
@@ -1,6 +1,6 @@
 import datetime
 import re
-from typing import List
+from typing import List, Optional
 
 from unstructured.nlp.patterns import (
     EMAIL_ADDRESS_PATTERN,
@@ -75,9 +75,12 @@ def extract_mapi_id(text: str) -> List[str]:
     return mapi_ids
 
 
-def extract_datetimetz(text: str) -> datetime.datetime:
-    date_string = re.findall(EMAIL_DATETIMETZ_PATTERN, text)
-    return datetime.datetime.strptime(date_string[0], "%a, %d %b %Y %H:%M:%S %z")
+def extract_datetimetz(text: str) -> Optional[datetime.datetime]:
+    date_extractions = re.findall(EMAIL_DATETIMETZ_PATTERN, text)
+    if len(date_extractions) > 0:
+        return datetime.datetime.strptime(date_extractions[0], "%a, %d %b %Y %H:%M:%S %z")
+    else:
+        return None
 
 
 def extract_us_phone_number(text: str):