mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
parent
749f9c6be8
commit
08ccee0acb
@ -1,13 +1,13 @@
|
||||
## 0.4.3-dev0
|
||||
## 0.4.3-dev1
|
||||
|
||||
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
|
||||
* Fix bug in `_parse_received_data`.
|
||||
|
||||
## 0.4.2
|
||||
|
||||
* Added `partition_image` to process documents in an image format.
|
||||
* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`
|
||||
|
||||
|
||||
## 0.4.1
|
||||
|
||||
* Added support for text files in the `partition` function
|
||||
@ -40,7 +40,7 @@
|
||||
elements
|
||||
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
|
||||
* Helper functions for identifying and extracting phone numbers
|
||||
* Add new function `extract_attachment_info` that extracts and decode the attachment
|
||||
* Add new function `extract_attachment_info` that extracts and decodes the attachment
|
||||
of an email.
|
||||
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
|
||||
* Add plain text functionality to `partition_email`
|
||||
|
||||
28
example-docs/fake-email-header.eml
Normal file
28
example-docs/fake-email-header.eml
Normal file
@ -0,0 +1,28 @@
|
||||
Received: from ABCDEFG-000.ABC.guide (00.0.0.00) by ABCDEFG-000.ABC.guide
|
||||
([ba23::58b5:2236:45g2:88h2]) with Unstructured TTTT Server (version=ABC0_0,
|
||||
cipher=ABC_ABCDE_ABC_NOPE_ABC_000_ABC_ABC000) id 00.0.000.0 via Techbox
|
||||
Transport; Wed, 20 Feb 2023 10:03:18 +1200
|
||||
MIME-Version: 1.0
|
||||
Date: Fri, 16 Dec 2022 17:04:16 -0500
|
||||
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
|
||||
Subject: Test Email
|
||||
From: Matthew Robinson <mrobinson@unstructured.io>
|
||||
To: Matthew Robinson <mrobinson@unstructured.io>
|
||||
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
|
||||
|
||||
--00000000000095c9b205eff92630
|
||||
Content-Type: text/plain; charset="UTF-8"
|
||||
|
||||
This is a test email to use for unit tests.
|
||||
|
||||
Important points:
|
||||
|
||||
- Roses are red
|
||||
- Violets are blue
|
||||
|
||||
--00000000000095c9b205eff92630
|
||||
Content-Type: text/html; charset="UTF-8"
|
||||
|
||||
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
|
||||
|
||||
--00000000000095c9b205eff92630--
|
||||
@ -1,14 +1,17 @@
|
||||
import datetime
|
||||
import email
|
||||
import os
|
||||
import pathlib
|
||||
import pytest
|
||||
|
||||
|
||||
from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
|
||||
from unstructured.documents.email_elements import (
|
||||
MetaData,
|
||||
Recipient,
|
||||
Sender,
|
||||
Subject,
|
||||
ReceivedInfo,
|
||||
)
|
||||
from unstructured.partition.email import (
|
||||
extract_attachment_info,
|
||||
@ -36,6 +39,30 @@ IMAGE_EXPECTED_OUTPUT = [
|
||||
ListItem(text="Violets are blue"),
|
||||
]
|
||||
|
||||
RECEIVED_HEADER_OUTPUT = [
|
||||
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
|
||||
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
|
||||
ReceivedInfo(
|
||||
name="received_datetimetz",
|
||||
text="2023-02-20 10:03:18+12:00",
|
||||
datestamp=datetime.datetime(
|
||||
2023, 2, 20, 10, 3, 18, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
|
||||
),
|
||||
),
|
||||
MetaData(name="MIME-Version", text="1.0"),
|
||||
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
|
||||
MetaData(
|
||||
name="Message-ID",
|
||||
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
|
||||
),
|
||||
Subject(text="Test Email"),
|
||||
Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
|
||||
Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
|
||||
MetaData(
|
||||
name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"'
|
||||
),
|
||||
]
|
||||
|
||||
HEADER_EXPECTED_OUTPUT = [
|
||||
MetaData(name="MIME-Version", text="1.0"),
|
||||
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
|
||||
@ -114,12 +141,12 @@ def test_partition_email_from_filename_with_embedded_image():
|
||||
|
||||
|
||||
def test_partition_email_header():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
|
||||
with open(filename, "r") as f:
|
||||
msg = email.message_from_file(f)
|
||||
elements = partition_email_header(msg)
|
||||
assert len(elements) > 0
|
||||
assert elements == HEADER_EXPECTED_OUTPUT
|
||||
assert elements == RECEIVED_HEADER_OUTPUT
|
||||
|
||||
|
||||
def test_extract_email_text_matches_html():
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.3-dev0" # pragma: no cover
|
||||
__version__ = "0.4.3-dev1" # pragma: no cover
|
||||
|
||||
@ -5,6 +5,12 @@ from typing import Callable, List, Union
|
||||
from unstructured.documents.elements import Element, Text, NoID
|
||||
|
||||
|
||||
class NoDatestamp(ABC):
|
||||
"""Class to indicate that an element does not have a datetime stamp."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class EmailElement(Element):
|
||||
"""An email element is a section of the email."""
|
||||
|
||||
@ -20,12 +26,11 @@ class Name(EmailElement):
|
||||
self,
|
||||
name: str,
|
||||
text: str,
|
||||
datestamp: Union[datetime, NoDatestamp] = NoDatestamp(),
|
||||
element_id: Union[str, NoID] = NoID(),
|
||||
):
|
||||
self.name: str = name
|
||||
self.text: str = text
|
||||
self.datestamp: datetime
|
||||
self.has_datestamp: bool = False
|
||||
|
||||
if isinstance(element_id, NoID):
|
||||
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
|
||||
@ -33,15 +38,17 @@ class Name(EmailElement):
|
||||
|
||||
super().__init__(element_id=element_id)
|
||||
|
||||
def set_datestamp(self, datestamp: datetime):
|
||||
self.datestamp = datestamp
|
||||
self.has_datestamp = True
|
||||
if isinstance(datestamp, datetime):
|
||||
self.datestamp: datetime = datestamp
|
||||
|
||||
def has_datestamp(self):
|
||||
return "self.datestamp" in globals()
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}: {self.text}"
|
||||
|
||||
def __eq__(self, other):
|
||||
if self.has_datestamp:
|
||||
if self.has_datestamp():
|
||||
return (
|
||||
self.name == other.name
|
||||
and self.text == other.text
|
||||
|
||||
@ -47,11 +47,8 @@ def _parse_received_data(data: str) -> List[Element]:
|
||||
elements.append(ReceivedInfo(name="mapi_id", text=mapi_id[0]))
|
||||
if datetimetz:
|
||||
elements.append(
|
||||
ReceivedInfo(name="received_datetimetz", text=str(datetimetz)).set_datestamp(
|
||||
datestamp=datetimetz
|
||||
)
|
||||
ReceivedInfo(name="received_datetimetz", text=str(datetimetz), datestamp=datetimetz)
|
||||
)
|
||||
|
||||
return elements
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user