mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-10 07:27:34 +00:00
fix: add handling for non-standard rfc-2822 formats (#564)
* fix: add handling for non-standard rfc-2822 formats * version and changelog * linting, linting, linting
This commit is contained in:
parent
f46eb06e2d
commit
38f7b652de
@ -8,8 +8,6 @@
|
|||||||
|
|
||||||
* fix: fileutils/file_type check json and eml decode ignore error
|
* fix: fileutils/file_type check json and eml decode ignore error
|
||||||
|
|
||||||
## 0.6.6-dev0
|
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* Added an additional trace logger for NLP debugging.
|
* Added an additional trace logger for NLP debugging.
|
||||||
@ -18,6 +16,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* `partition_email` was updated to more flexibly handle deviations from the RFC-2822 standard.
|
||||||
|
The time in the metadata is set to `None` if the time does not match the RFC-2822 format at all.
|
||||||
* Include all metadata fields when converting to dataframe or CSV
|
* Include all metadata fields when converting to dataframe or CSV
|
||||||
|
|
||||||
## 0.6.5
|
## 0.6.5
|
||||||
|
|||||||
@ -20,6 +20,7 @@ from unstructured.documents.email_elements import (
|
|||||||
Subject,
|
Subject,
|
||||||
)
|
)
|
||||||
from unstructured.partition.email import (
|
from unstructured.partition.email import (
|
||||||
|
convert_to_iso_8601,
|
||||||
extract_attachment_info,
|
extract_attachment_info,
|
||||||
partition_email,
|
partition_email,
|
||||||
partition_email_header,
|
partition_email_header,
|
||||||
@ -226,3 +227,17 @@ def test_partition_email_processes_fake_email_with_header():
|
|||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
|
||||||
elements = partition_email(filename=filename)
|
elements = partition_email(filename=filename)
|
||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
(("time", "expected")),
|
||||||
|
[
|
||||||
|
("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
|
||||||
|
("Thu, 4 May 2023 02:32:49 +0000", "2023-05-04T02:32:49+00:00"),
|
||||||
|
("Thu, 4 May 2023 02:32:49 +0000 (UTC)", "2023-05-04T02:32:49+00:00"),
|
||||||
|
("Thursday 5/3/2023 02:32:49", None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_convert_to_iso_8601(time, expected):
|
||||||
|
iso_time = convert_to_iso_8601(time)
|
||||||
|
assert iso_time == expected
|
||||||
|
|||||||
@ -89,8 +89,11 @@ MAPI_ID_PATTERN = "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;" # noqa: W605 NOTE(harrell)
|
|||||||
# - skipping qa because we need the escape for the regex
|
# - skipping qa because we need the escape for the regex
|
||||||
|
|
||||||
# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
|
# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
|
||||||
EMAIL_DATETIMETZ_PATTERN = "[a-zA-z]{3},\s[0-9]{2}\s[a-zA-Z]{3}\s[0-9]{4}\s[0-9]{2}:[0-9]{2}:[0-9]{2}\s[+0-9]{5}" # noqa: W605,E501
|
|
||||||
# NOTE(harrell) - skipping qa because we need the escape for the regex
|
# NOTE(harrell) - skipping qa because we need the escape for the regex
|
||||||
|
EMAIL_DATETIMETZ_PATTERN = (
|
||||||
|
r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}" # noqa: W605,E501
|
||||||
|
)
|
||||||
|
EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
|
||||||
|
|
||||||
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
|
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
|
||||||
# - skipping qa because we need the escape for the regex
|
# - skipping qa because we need the escape for the regex
|
||||||
|
|||||||
@ -36,6 +36,8 @@ from unstructured.documents.email_elements import (
|
|||||||
Sender,
|
Sender,
|
||||||
Subject,
|
Subject,
|
||||||
)
|
)
|
||||||
|
from unstructured.logger import logger
|
||||||
|
from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
from unstructured.partition.text import partition_text, split_by_paragraph
|
from unstructured.partition.text import partition_text, split_by_paragraph
|
||||||
|
|
||||||
@ -112,9 +114,17 @@ def build_email_metadata(msg: Message) -> ElementMetadata:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def convert_to_iso_8601(time: str) -> str:
|
def convert_to_iso_8601(time: str) -> Optional[str]:
|
||||||
"""Converts the datetime from the email output to ISO-8601 format."""
|
"""Converts the datetime from the email output to ISO-8601 format."""
|
||||||
datetime_object = datetime.datetime.strptime(time, "%a, %d %b %Y %H:%M:%S %z")
|
cleaned_time = clean_extra_whitespace(time)
|
||||||
|
regex_match = EMAIL_DATETIMETZ_PATTERN_RE.search(cleaned_time)
|
||||||
|
if regex_match is None:
|
||||||
|
logger.warning(f"{time} did not match RFC-2822 format. Unable to extract the time.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
start, end = regex_match.span()
|
||||||
|
dt_string = cleaned_time[start:end]
|
||||||
|
datetime_object = datetime.datetime.strptime(dt_string, "%a, %d %b %Y %H:%M:%S %z")
|
||||||
return datetime_object.isoformat()
|
return datetime_object.isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user