mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
feat: Added EmailElement for email documents (#103)
* new EmailElement data structure
This commit is contained in:
parent
4f6fc29b54
commit
e0a76effff
@ -1,4 +1,4 @@
|
||||
## 0.3.5-dev1
|
||||
## 0.3.5-dev2
|
||||
|
||||
* Add new pattern to recognize plain text dash bullets
|
||||
* Add test for bullet patterns
|
||||
@ -15,6 +15,7 @@
|
||||
* Adds the `partition_email` partitioning brick
|
||||
* Adds the `replace_mime_encodings` cleaning bricks
|
||||
* Small fix to HTML parsing related to processing list items with sub-tags
|
||||
* Add `EmailElement` data structure to store email documents
|
||||
|
||||
## 0.3.2
|
||||
|
||||
|
||||
44
test_unstructured/documents/test_email_elements.py
Normal file
44
test_unstructured/documents/test_email_elements.py
Normal file
@ -0,0 +1,44 @@
|
||||
from functools import partial
|
||||
import pytest
|
||||
|
||||
from unstructured.cleaners.core import clean_prefix
|
||||
from unstructured.cleaners.translate import translate_text
|
||||
from unstructured.documents.email_elements import EmailElement, NoID, Name
|
||||
|
||||
|
||||
def test_text_id():
    """A Name built without an explicit id gets the truncated SHA-256 of its text."""
    element = Name(name="Example", text="hello there!")
    expected_id = "c69509590d81db2f37f9d75480c8efed"
    assert element.id == expected_id
|
||||
|
||||
|
||||
def test_element_defaults_to_blank_id():
    """An EmailElement created with no arguments carries a NoID placeholder id."""
    blank = EmailElement()
    assert isinstance(blank.id, NoID)
|
||||
|
||||
|
||||
def test_text_element_apply_cleaners():
    """A single cleaner passed to apply() is run on both the name and the text."""
    element = Name(name="[2] Example docs", text="[1] A Textbook on Crocodile Habitats")
    strip_citation = partial(clean_prefix, pattern=r"\[\d{1,2}\]")

    element.apply(strip_citation)

    assert str(element) == "Example docs: A Textbook on Crocodile Habitats"
|
||||
|
||||
|
||||
def test_name_element_apply_multiple_cleaners():
    """Several cleaners are applied in order to both the name and the text."""
    raw = "[1] A Textbook on Crocodile Habitats"
    translated = "Учебник по крокодильным средам обитания"

    element = Name(name=raw, text=raw)
    element.apply(
        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
        partial(translate_text, target_lang="ru"),
    )

    # Name renders as "<name>: <text>", and both parts went through the same pipeline.
    assert str(element) == f"{translated}: {translated}"
|
||||
|
||||
|
||||
def test_apply_raises_if_func_does_not_produce_string():
    """apply() rejects a cleaner whose result is not a string."""
    element = Name(name="Example docs", text="[1] A Textbook on Crocodile Habitats")

    def returns_int(_):
        return 1

    with pytest.raises(ValueError):
        element.apply(returns_int)
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.3.5-dev1" # pragma: no cover
|
||||
__version__ = "0.3.5-dev2" # pragma: no cover
|
||||
|
||||
150
unstructured/documents/email_elements.py
Normal file
150
unstructured/documents/email_elements.py
Normal file
@ -0,0 +1,150 @@
|
||||
from abc import ABC
|
||||
import hashlib
|
||||
from typing import Callable, List, Union
|
||||
from unstructured.documents.elements import Element, Text, NoID
|
||||
|
||||
|
||||
class EmailElement(Element):
    """Common base class for the elements that represent a section of an email."""
|
||||
|
||||
|
||||
class Name(EmailElement):
    """Base email element that pairs a name with a piece of text.

    The element renders as ``"<name>: <text>"``. When no ``element_id`` is supplied,
    the id is derived from ``text`` only, so two elements with the same text but
    different names share the same default id.
    """

    category = "Uncategorized"

    def __init__(self, name: str, text: str, element_id: Union[str, NoID] = NoID()):
        """Stores the name and text and resolves the element id.

        Parameters
        ----------
        name
            The label of the element.
        text
            The text content of the element.
        element_id
            An explicit id. If it is a NoID instance, an id is computed from ``text``.
        """
        self.name: str = name
        self.text: str = text

        if isinstance(element_id, NoID):
            # NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

        super().__init__(element_id=element_id)

    def __str__(self):
        return f"{self.name}: {self.text}"

    def __eq__(self, other):
        # Comparing against an unrelated object must not blow up with AttributeError;
        # returning NotImplemented lets Python fall back to its default comparison.
        if not isinstance(other, Name):
            return NotImplemented
        return self.name == other.name and self.text == other.text

    def apply(self, *cleaners: Callable):
        """Applies cleaning bricks, in order, to both the text and the name of the element.

        Each function that's passed in should take a string as input and produce a
        string as output. The element is only updated if every cleaner succeeds.

        Raises
        ------
        ValueError
            If any cleaner returns something that is not a string.
        """
        cleaned_text = self.text
        cleaned_name = self.name

        for cleaner in cleaners:
            # Validate after every step so a bad cleaner is reported as a ValueError
            # instead of its output being passed on to the next cleaner.
            cleaned_text = cleaner(cleaned_text)
            if not isinstance(cleaned_text, str):
                raise ValueError("Cleaner produced a non-string output.")

            cleaned_name = cleaner(cleaned_name)
            if not isinstance(cleaned_name, str):
                raise ValueError("Cleaner produced a non-string output.")

        self.text = cleaned_text
        self.name = cleaned_name
|
||||
|
||||
|
||||
class BodyText(List[Text]):
    """The body of an email: a list of Text elements made up of well-formulated
    sentences. Elements such as titles, headers, footers, and captions are excluded."""

    category = "BodyText"
|
||||
|
||||
|
||||
class Recipient(Text):
    """Text element that holds the recipient information of an email."""

    category = "Recipient"
|
||||
|
||||
|
||||
class Sender(Text):
    """Text element that holds the sender information of an email."""

    category = "Sender"
|
||||
|
||||
|
||||
class Subject(Text):
    """Text element that holds the subject information of an email."""

    category = "Subject"
|
||||
|
||||
|
||||
class ReceivedInfo(List[Text]):
    """A list of Text elements that holds header information of an email."""

    category = "ReceivedInfo"
|
||||
|
||||
|
||||
class MetaData(Name):
    """Named element that holds a piece of header meta data of an email."""

    category = "MetaData"
|
||||
|
||||
|
||||
class Attachment(Name):
    """Named element that holds the attachment name in an email."""

    category = "Attachment"
|
||||
|
||||
|
||||
class Email(ABC):
    """An email with its attributes.

    Holds the recipient, sender, subject and body, plus the optional received-header
    information, header meta data and attachments. ``str(email)`` renders all of
    these parts as a multi-line summary.
    """

    def __init__(
        self,
        recipient: Recipient,
        sender: Sender,
        subject: Subject,
        body: BodyText,
        received_info: Union[ReceivedInfo, None] = None,
        meta_data: Union[MetaData, None] = None,
        attachment: Union[List[Attachment], None] = None,
    ):
        """Stores the parts of the email.

        Parameters
        ----------
        recipient, sender, subject, body
            The required parts of the email.
        received_info
            Optional received-header information. Stored as None when omitted.
        meta_data
            Optional header meta data. Stored as None when omitted.
        attachment
            Optional list of attachments. Stored as an empty list when omitted.
        """
        self.recipient = recipient
        self.sender = sender
        self.subject = subject
        self.body = body
        # A bare annotation (``self.x: T``) does not create the attribute, so always
        # bind the optional parts; otherwise __str__ fails with AttributeError.
        self.received_info = received_info
        self.meta_data = meta_data
        self.attachment = [] if attachment is None else attachment

    def __str__(self):
        return f"""
        Recipient: {self.recipient}
        Sender: {self.sender}
        Subject: {self.subject}

        Received Header Information:

        {self.received_info}

        Meta Data From Header:

        {self.meta_data}

        Body:

        {self.body}

        Attachment:

        {[file.name for file in self.attachment]}
        """
|
||||
Loading…
x
Reference in New Issue
Block a user