chore: Fix parse received data (#143)

* fix parse_received data
This commit is contained in:
Mallori Harrell 2023-01-17 16:36:44 -06:00 committed by GitHub
parent 749f9c6be8
commit 08ccee0acb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 75 additions and 16 deletions

View File

@ -1,13 +1,13 @@
## 0.4.3-dev0
## 0.4.3-dev1
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
* Fix bug in `_parse_received_data`.
## 0.4.2
* Added `partition_image` to process documents in an image format.
* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`
## 0.4.1
* Added support for text files in the `partition` function
@ -40,7 +40,7 @@
elements
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
* Helper functions for identifying and extracting phone numbers
* Add new function `extract_attachment_info` that extracts and decode the attachment
* Add new function `extract_attachment_info` that extracts and decodes the attachment
of an email.
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
* Add plain text functionality to `partition_email`

View File

@ -0,0 +1,28 @@
Received: from ABCDEFG-000.ABC.guide (00.0.0.00) by ABCDEFG-000.ABC.guide
([ba23::58b5:2236:45g2:88h2]) with Unstructured TTTT Server (version=ABC0_0,
cipher=ABC_ABCDE_ABC_NOPE_ABC_000_ABC_ABC000) id 00.0.000.0 via Techbox
Transport; Wed, 20 Feb 2023 10:03:18 +1200
MIME-Version: 1.0
Date: Fri, 16 Dec 2022 17:04:16 -0500
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
Subject: Test Email
From: Matthew Robinson <mrobinson@unstructured.io>
To: Matthew Robinson <mrobinson@unstructured.io>
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
This is a test email to use for unit tests.
Important points:
- Roses are red
- Violets are blue
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
--00000000000095c9b205eff92630--

View File

@ -1,14 +1,17 @@
import datetime
import email
import os
import pathlib
import pytest
from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
from unstructured.documents.email_elements import (
MetaData,
Recipient,
Sender,
Subject,
ReceivedInfo,
)
from unstructured.partition.email import (
extract_attachment_info,
@ -36,6 +39,30 @@ IMAGE_EXPECTED_OUTPUT = [
ListItem(text="Violets are blue"),
]
RECEIVED_HEADER_OUTPUT = [
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
ReceivedInfo(
name="received_datetimetz",
text="2023-02-20 10:03:18+12:00",
datestamp=datetime.datetime(
2023, 2, 20, 10, 3, 18, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
),
),
MetaData(name="MIME-Version", text="1.0"),
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
MetaData(
name="Message-ID",
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
),
Subject(text="Test Email"),
Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
MetaData(
name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"'
),
]
HEADER_EXPECTED_OUTPUT = [
MetaData(name="MIME-Version", text="1.0"),
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
@ -114,12 +141,12 @@ def test_partition_email_from_filename_with_embedded_image():
def test_partition_email_header():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
with open(filename, "r") as f:
msg = email.message_from_file(f)
elements = partition_email_header(msg)
assert len(elements) > 0
assert elements == HEADER_EXPECTED_OUTPUT
assert elements == RECEIVED_HEADER_OUTPUT
def test_extract_email_text_matches_html():

View File

@ -1 +1 @@
__version__ = "0.4.3-dev0" # pragma: no cover
__version__ = "0.4.3-dev1" # pragma: no cover

View File

@ -5,6 +5,12 @@ from typing import Callable, List, Union
from unstructured.documents.elements import Element, Text, NoID
class NoDatestamp(ABC):
    """Class to indicate that an element does not have a datetime stamp."""

    pass
class EmailElement(Element):
"""An email element is a section of the email."""
@ -20,12 +26,11 @@ class Name(EmailElement):
self,
name: str,
text: str,
datestamp: Union[datetime, NoDatestamp] = NoDatestamp(),
element_id: Union[str, NoID] = NoID(),
):
self.name: str = name
self.text: str = text
self.datestamp: datetime
self.has_datestamp: bool = False
if isinstance(element_id, NoID):
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
@ -33,15 +38,17 @@ class Name(EmailElement):
super().__init__(element_id=element_id)
def set_datestamp(self, datestamp: datetime):
self.datestamp = datestamp
self.has_datestamp = True
if isinstance(datestamp, datetime):
self.datestamp: datetime = datestamp
def has_datestamp(self):
    """Return True when this element carries a ``datestamp`` attribute.

    The attribute is only assigned when a ``datetime`` value was supplied, so
    its presence on the instance is the signal that a datestamp exists.
    """
    # NOTE: the earlier check, `"self.datestamp" in globals()`, searched the
    # module namespace for a global literally named "self.datestamp" and was
    # therefore always False; inspect the instance itself instead.
    return hasattr(self, "datestamp")
def __str__(self):
    """Return the element formatted as ``name: text``."""
    return f"{self.name}: {self.text}"
def __eq__(self, other):
if self.has_datestamp:
if self.has_datestamp():
return (
self.name == other.name
and self.text == other.text

View File

@ -47,11 +47,8 @@ def _parse_received_data(data: str) -> List[Element]:
elements.append(ReceivedInfo(name="mapi_id", text=mapi_id[0]))
if datetimetz:
elements.append(
ReceivedInfo(name="received_datetimetz", text=str(datetimetz)).set_datestamp(
datestamp=datetimetz
)
ReceivedInfo(name="received_datetimetz", text=str(datetimetz), datestamp=datetimetz)
)
return elements