feat: Added EmailElement for email documents (#103)

* new EmailElement data structure
This commit is contained in:
Mallori Harrell 2022-12-21 16:03:44 -06:00 committed by GitHub
parent 4f6fc29b54
commit e0a76effff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 197 additions and 2 deletions

View File

@ -1,4 +1,4 @@
## 0.3.5-dev1
## 0.3.5-dev2
* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
@ -15,6 +15,7 @@
* Adds the `partition_email` partitioning brick
* Adds the `replace_mime_encodings` cleaning bricks
* Small fix to HTML parsing related to processing list items with sub-tags
* Add `EmailElement` data structure to store email documents
## 0.3.2

View File

@ -0,0 +1,44 @@
from functools import partial
import pytest
from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.email_elements import EmailElement, NoID, Name
def test_text_id():
    """A Name built without an explicit ID exposes the expected text-derived ID."""
    expected_id = "c69509590d81db2f37f9d75480c8efed"
    element = Name(name="Example", text="hello there!")
    assert element.id == expected_id
def test_element_defaults_to_blank_id():
    """An EmailElement created with no arguments has a NoID instance as its ID."""
    blank = EmailElement()
    element_id = blank.id
    assert isinstance(element_id, NoID)
def test_text_element_apply_cleaners():
    """A single prefix cleaner is applied to both the name and the text of a Name."""
    strip_bracket_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Name(name="[2] Example docs", text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_bracket_prefix)

    rendered = str(element)
    assert rendered == "Example docs: A Textbook on Crocodile Habitats"
def test_name_element_apply_multiple_cleaners():
    """Several cleaners passed to apply() are all run on both the name and the text."""
    raw = "[1] A Textbook on Crocodile Habitats"
    expected = "Учебник по крокодильным средам обитания: Учебник по крокодильным средам обитания"

    strip_bracket_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    to_russian = partial(translate_text, target_lang="ru")

    element = Name(name=raw, text=raw)
    element.apply(strip_bracket_prefix, to_russian)

    assert str(element) == expected
def test_apply_raises_if_func_does_not_produce_string():
    """apply() raises ValueError for a cleaner whose return value is not a string."""

    def returns_int(s):
        return 1

    element = Name(name="Example docs", text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(returns_int)

View File

@ -1 +1 @@
# Package version string; the trailing pragma excludes the line from coverage reports.
__version__ = "0.3.5-dev1" # pragma: no cover
__version__ = "0.3.5-dev2" # pragma: no cover

View File

@ -0,0 +1,150 @@
from abc import ABC
import hashlib
from typing import Callable, List, Union
from unstructured.documents.elements import Element, Text, NoID
class EmailElement(Element):
    """A section of an email.

    Adds no behavior of its own on top of ``Element``.
    """
class Name(EmailElement):
    """An email element made of a ``name`` label paired with a ``text`` value.

    ``str()`` renders the element as ``"<name>: <text>"``. Two elements compare equal
    when both their name and their text match. Because ``__eq__`` is defined without
    ``__hash__``, instances are not hashable.
    """

    category = "Uncategorized"

    def __init__(self, name: str, text: str, element_id: Union[str, NoID] = NoID()):
        """Store the name and text, deriving an ID from the text when none is given.

        Args:
            name: Label for the element.
            text: Content associated with the label.
            element_id: Explicit ID. When left as a ``NoID`` instance, the ID is the
                first 32 hex characters of the SHA256 digest of ``text`` (``name`` is
                not part of the hash).
        """
        self.name: str = name
        self.text: str = text

        if isinstance(element_id, NoID):
            # NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

        super().__init__(element_id=element_id)

    def __str__(self):
        """Return the element formatted as ``"<name>: <text>"``."""
        return f"{self.name}: {self.text}"

    def __eq__(self, other):
        """Compare by name and text against any object exposing both attributes."""
        # Returning NotImplemented lets Python fall back to its default comparison
        # (ultimately False) instead of raising AttributeError for unrelated objects.
        if not (hasattr(other, "name") and hasattr(other, "text")):
            return NotImplemented
        return self.name == other.name and self.text == other.text

    def apply(self, *cleaners: Callable):
        """Applies cleaning bricks, in order, to both the text and the name.

        Each function that's passed in should take a string as input and produce a
        string as output. The element is only updated if every cleaner succeeds.

        Raises:
            ValueError: If any cleaner returns something other than a string.
        """
        cleaned_text = self.text
        cleaned_name = self.name

        for cleaner in cleaners:
            # Validate after every call so a bad cleaner is reported as a ValueError
            # here, rather than its output being fed into the next cleaner and failing
            # there with an unrelated error.
            cleaned_text = cleaner(cleaned_text)
            if not isinstance(cleaned_text, str):
                raise ValueError("Cleaner produced a non-string output.")

            cleaned_name = cleaner(cleaned_name)
            if not isinstance(cleaned_name, str):
                raise ValueError("Cleaner produced a non-string output.")

        self.text = cleaned_text
        self.name = cleaned_name
class BodyText(List[Text]):
    """The body of an email, held as a list of ``Text`` elements.

    Meant for runs of well-formulated sentences; elements such as titles, headers,
    footers, and captions are excluded.
    """

    category = "BodyText"
class Recipient(Text):
    """A text element for capturing the recipient information of an email."""

    category = "Recipient"
class Sender(Text):
    """A text element for capturing the sender information of an email."""

    category = "Sender"
class Subject(Text):
    """A text element for capturing the subject information of an email."""

    category = "Subject"
class ReceivedInfo(List[Text]):
    """A list of ``Text`` elements for capturing header information of an email."""

    category = "ReceivedInfo"
class MetaData(Name):
    """A name/text element for capturing header meta data of an email."""

    category = "MetaData"
class Attachment(Name):
    """A name/text element for capturing the attachment name in an email."""

    category = "Attachment"
class Email(ABC):
    """An email class with its attributes.

    The recipient, sender, subject and body are supplied at construction time. The
    received-header information, header meta data and attachments start out empty and
    can be assigned afterwards.
    """

    def __init__(self, recipient: Recipient, sender: Sender, subject: Subject, body: BodyText):
        """Store the required parts of the email and initialize the optional ones.

        Args:
            recipient: Recipient information of the email.
            sender: Sender information of the email.
            subject: Subject of the email.
            body: Body of the email.
        """
        self.recipient = recipient
        self.sender = sender
        self.subject = subject
        self.body = body

        # A bare annotation such as ``self.x: T`` does not create the attribute, so
        # these get explicit empty defaults; otherwise ``__str__`` would raise
        # AttributeError on any instance where they had not been set by hand.
        self.received_info: ReceivedInfo = ReceivedInfo()
        self.meta_data: Union[MetaData, None] = None
        self.attachment: List[Attachment] = []

    def __str__(self):
        """Return a multi-line summary of the email, listing attachments by name."""
        # Render unset meta data as an empty section rather than the text "None".
        meta_data = "" if self.meta_data is None else self.meta_data
        attachment_names = [file.name for file in self.attachment]
        return f"""
Recipient: {self.recipient}
Sender: {self.sender}
Subject: {self.subject}
Received Header Information:
{self.received_info}
Meta Data From Header:
{meta_data}
Body:
{self.body}
Attachment:
{attachment_names}
"""