mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-03 11:34:07 +00:00
feat: Add new functionality to parse text and header of emails (#111)
* partition_text function
This commit is contained in:
parent
7fb8713527
commit
d7a00046a9
@ -1,10 +1,14 @@
|
||||
## 0.3.6-dev1
|
||||
## 0.3.6-dev2
|
||||
|
||||
* Cleaning brick for removing ordered bullets `clean_ordered_bullets`.
|
||||
* Extract brick method for ordered bullets `extract_ordered_bullets`.
|
||||
* Test for `clean_ordered_bullets`.
|
||||
* Test for `extract_ordered_bullets`.
|
||||
* Added `partition_docx` for pre-processing Word Documents.
|
||||
* Added new REGEX patterns to extract email header information
|
||||
* Added new functions to extract header information `parse_received_data` and `partition_email_header`
|
||||
* Added new function to parse plain text files `partition_text`
|
||||
* Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
|
||||
|
||||
## 0.3.5
|
||||
|
||||
@ -18,6 +22,7 @@
|
||||
* Add new function `extract_attachment_info` that extracts and decode the attachment
|
||||
of an email.
|
||||
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
|
||||
* Add plain text functionality to `partition_email`
|
||||
|
||||
## 0.3.4
|
||||
|
||||
|
||||
45
README.md
45
README.md
@ -190,6 +190,51 @@ Roses are red
|
||||
Violets are blue
|
||||
```
|
||||
|
||||
### Text Document Parsing
|
||||
|
||||
The `partition_text` function within `unstructured` can be used to parse simple
|
||||
text files into elements.
|
||||
|
||||
`partition_text` accepts a filename, a file-like object, or raw text as input. The following three snippets are for parsing text files:
|
||||
|
||||
```python
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
elements = partition_text(filename="example-docs/fake-text.txt")
|
||||
|
||||
with open("example-docs/fake-text.txt", "r") as f:
|
||||
elements = partition_text(file=f)
|
||||
|
||||
with open("example-docs/fake-text.txt", "r") as f:
|
||||
text = f.read()
|
||||
elements = partition_text(text=text)
|
||||
```
|
||||
|
||||
The `elements` output will look like the following:
|
||||
|
||||
```python
|
||||
[<unstructured.documents.html.HTMLNarrativeText at 0x13ab14370>,
|
||||
<unstructured.documents.html.HTMLTitle at 0x106877970>,
|
||||
<unstructured.documents.html.HTMLListItem at 0x1068776a0>,
|
||||
<unstructured.documents.html.HTMLListItem at 0x13fe4b0a0>]
|
||||
```
|
||||
|
||||
Run `print("\n\n".join([str(el) for el in elements]))` to get a string representation of the
|
||||
output, which looks like:
|
||||
|
||||
```python
|
||||
This is a test document to use for unit tests.
|
||||
|
||||
Important points:
|
||||
|
||||
Hamburgers are delicious
|
||||
|
||||
Dogs are the best
|
||||
|
||||
I love fuzzy blankets
|
||||
```
|
||||
|
||||
|
||||
## :guardsman: Security Policy
|
||||
|
||||
See our [security policy](https://github.com/Unstructured-IO/unstructured/security/policy) for
|
||||
|
||||
@ -90,7 +90,11 @@ Examples:
|
||||
The ``partition_email`` function partitions ``.eml`` documents and works with exports
|
||||
from email clients such as Microsoft Outlook and Gmail. The ``partition_email``
|
||||
takes a filename, file-like object, or raw text as input and produces a list of
|
||||
document ``Element`` objects as output.
|
||||
document ``Element`` objects as output. Also ``content_source`` can be set to ``text/html``
|
||||
(default) or ``text/plain`` to process the html or plain text version of the email, respectively.
|
||||
In order for ``partition_email`` to also return the header information (e.g. sender, recipient,
|
||||
attachment, etc.), ``include_headers`` must be set to ``True``. When ``include_headers`` is
|
||||
``True``, the header elements come first in the returned list, followed by the body elements.
|
||||
|
||||
Examples:
|
||||
|
||||
@ -107,6 +111,37 @@ Examples:
|
||||
text = f.read()
|
||||
elements = partition_email(text=text)
|
||||
|
||||
with open("example-docs/fake-email.eml", "r") as f:
|
||||
text = f.read()
|
||||
elements = partition_email(text=text, content_source="text/plain")
|
||||
|
||||
with open("example-docs/fake-email.eml", "r") as f:
|
||||
text = f.read()
|
||||
elements = partition_email(text=text, include_headers=True)
|
||||
|
||||
|
||||
``partition_text``
|
||||
---------------------
|
||||
|
||||
The ``partition_text`` function partitions text files. It takes a filename,
|
||||
file-like object, or raw text as input and produces a list of ``Element`` objects as output.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
elements = partition_text(filename="example-docs/fake-text.txt")
|
||||
|
||||
with open("example-docs/fake-text.txt", "r") as f:
|
||||
elements = partition_text(file=f)
|
||||
|
||||
with open("example-docs/fake-text.txt", "r") as f:
|
||||
text = f.read()
|
||||
elements = partition_text(text=text)
|
||||
|
||||
|
||||
``extract_attachment_info``
|
||||
----------------------------
|
||||
|
||||
@ -550,6 +585,96 @@ Examples:
|
||||
# Returns "Look at me, I'm flying!"
|
||||
extract_text_after(text, r"SPEAKER \d{1}:")
|
||||
|
||||
``extract_email_address``
|
||||
--------------------------
|
||||
|
||||
Extracts email addresses from a string input and returns a list of all the email
|
||||
addresses in the input string.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.cleaners.extract import extract_email_address
|
||||
|
||||
text = """Me me@email.com and You <You@email.com>
|
||||
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
||||
|
||||
# Returns "['me@email.com', 'you@email.com']"
|
||||
extract_email_address(text)
|
||||
|
||||
|
||||
``extract_ip_address``
|
||||
------------------------
|
||||
|
||||
Extracts IPv4 and IPv6 IP addresses in the input string and
|
||||
returns a list of all IP addresses in the input string.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.cleaners.extract import extract_ip_address
|
||||
|
||||
text = """Me me@email.com and You <You@email.com>
|
||||
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
||||
|
||||
# Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']"
|
||||
extract_ip_address(text)
|
||||
|
||||
|
||||
``extract_ip_address_name``
|
||||
----------------------------
|
||||
|
||||
Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml``
|
||||
file. ``extract_ip_address_name`` takes in a string and returns a list of all
|
||||
IP address names in the input string.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.cleaners.extract import extract_ip_address_name
|
||||
|
||||
text = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||
\n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||
|
||||
# Returns "['ABC.DEF.local', 'ABC.DEF.local2']"
|
||||
extract_ip_address_name(text)
|
||||
|
||||
|
||||
``extract_mapi_id``
|
||||
----------------------
|
||||
|
||||
Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml``
|
||||
file. ``extract_mapi_id`` takes in a string and returns a list of the
|
||||
``mapi id`` strings found in the input string.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.cleaners.extract import extract_mapi_id
|
||||
|
||||
text = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||
\n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||
|
||||
# Returns "['32.88.5467.123']"
|
||||
extract_mapi_id(text)
|
||||
|
||||
|
||||
``extract_datetimetz``
|
||||
----------------------
|
||||
|
||||
Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml``
|
||||
file. ``extract_datetimetz`` takes in a string and returns a datetime.datetime
|
||||
object from the input string.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.cleaners.extract import extract_datetimetz
|
||||
|
||||
text = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||
\n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||
|
||||
# Returns datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)))
|
||||
extract_datetimetz(text)
|
||||
|
||||
|
||||
``extract_us_phone_number``
|
||||
---------------------------
|
||||
|
||||
24
example-docs/fake-email.txt
Normal file
24
example-docs/fake-email.txt
Normal file
@ -0,0 +1,24 @@
|
||||
MIME-Version: 1.0
|
||||
Date: Fri, 16 Dec 2022 17:04:16 -0500
|
||||
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
|
||||
Subject: Test Email
|
||||
From: Matthew Robinson <mrobinson@unstructured.io>
|
||||
To: Matthew Robinson <mrobinson@unstructured.io>
|
||||
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
|
||||
|
||||
--00000000000095c9b205eff92630
|
||||
Content-Type: text/plain; charset="UTF-8"
|
||||
|
||||
This is a test email to use for unit tests.
|
||||
|
||||
Important points:
|
||||
|
||||
- Roses are red
|
||||
- Violets are blue
|
||||
|
||||
--00000000000095c9b205eff92630
|
||||
Content-Type: text/html; charset="UTF-8"
|
||||
|
||||
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
|
||||
|
||||
--00000000000095c9b205eff92630--
|
||||
7
example-docs/fake-text.txt
Normal file
7
example-docs/fake-text.txt
Normal file
@ -0,0 +1,7 @@
|
||||
This is a test document to use for unit tests.
|
||||
|
||||
Important points:
|
||||
|
||||
- Hamburgers are delicious
|
||||
- Dogs are the best
|
||||
- I love fuzzy blankets
|
||||
66
requirements.txt
Normal file
66
requirements.txt
Normal file
@ -0,0 +1,66 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.9
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile
|
||||
#
|
||||
argilla==1.1.1
|
||||
# via unstructured (setup.py)
|
||||
backoff==2.2.1
|
||||
# via argilla
|
||||
certifi==2022.12.7
|
||||
# via httpx
|
||||
click==8.1.3
|
||||
# via nltk
|
||||
deprecated==1.2.13
|
||||
# via argilla
|
||||
h11==0.9.0
|
||||
# via httpcore
|
||||
httpcore==0.11.1
|
||||
# via httpx
|
||||
httpx==0.15.5
|
||||
# via argilla
|
||||
idna==3.4
|
||||
# via rfc3986
|
||||
joblib==1.2.0
|
||||
# via nltk
|
||||
lxml==4.9.2
|
||||
# via unstructured (setup.py)
|
||||
monotonic==1.6
|
||||
# via argilla
|
||||
nltk==3.8
|
||||
# via unstructured (setup.py)
|
||||
numpy==1.23.5
|
||||
# via
|
||||
# argilla
|
||||
# pandas
|
||||
packaging==22.0
|
||||
# via argilla
|
||||
pandas==1.5.2
|
||||
# via argilla
|
||||
pydantic==1.10.2
|
||||
# via argilla
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
pytz==2022.6
|
||||
# via pandas
|
||||
regex==2022.10.31
|
||||
# via nltk
|
||||
rfc3986[idna2008]==1.5.0
|
||||
# via httpx
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.0
|
||||
# via
|
||||
# httpcore
|
||||
# httpx
|
||||
tqdm==4.64.1
|
||||
# via
|
||||
# argilla
|
||||
# nltk
|
||||
typing-extensions==4.4.0
|
||||
# via pydantic
|
||||
wrapt==1.13.3
|
||||
# via
|
||||
# argilla
|
||||
# deprecated
|
||||
@ -1,7 +1,12 @@
|
||||
import pytest
|
||||
import datetime
|
||||
|
||||
import unstructured.cleaners.extract as extract
|
||||
|
||||
EMAIL_META_DATA_INPUT = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||
\n ABC.DEF.local ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||
|
||||
|
||||
def test_get_indexed_match_raises_with_bad_index():
|
||||
with pytest.raises(ValueError):
|
||||
@ -23,6 +28,35 @@ def test_extract_text_after():
|
||||
assert extract.extract_text_after(text, "BLAH;", 0) == "Student: BLAH BLAH BLAH!"
|
||||
|
||||
|
||||
def test_extract_email_address():
|
||||
text = "Im Rabn <Im.Rabn@npf.gov.nr>"
|
||||
assert extract.extract_email_address(text) == ["im.rabn@npf.gov.nr"]
|
||||
|
||||
|
||||
def test_extract_ip_address():
|
||||
assert extract.extract_ip_address(EMAIL_META_DATA_INPUT) == [
|
||||
"ba23::58b5:2236:45g2:88h2",
|
||||
"ba23::58b5:2236:45g2:88h2%25",
|
||||
]
|
||||
|
||||
|
||||
def test_extract_ip_address_name():
|
||||
assert extract.extract_ip_address_name(EMAIL_META_DATA_INPUT) == [
|
||||
"ABC.DEF.local",
|
||||
"ABC.DEF.local",
|
||||
]
|
||||
|
||||
|
||||
def test_extract_mapi_id():
|
||||
assert extract.extract_mapi_id(EMAIL_META_DATA_INPUT) == ["32.88.5467.123"]
|
||||
|
||||
|
||||
def test_extract_datetimetz():
|
||||
assert extract.extract_datetimetz(EMAIL_META_DATA_INPUT) == datetime.datetime(
|
||||
2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
|
||||
@ -4,7 +4,17 @@ import pathlib
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import NarrativeText, Title, ListItem
|
||||
from unstructured.partition.email import partition_email, extract_attachment_info
|
||||
from unstructured.documents.email_elements import (
|
||||
MetaData,
|
||||
Recipient,
|
||||
Sender,
|
||||
Subject,
|
||||
)
|
||||
from unstructured.partition.email import (
|
||||
extract_attachment_info,
|
||||
partition_email,
|
||||
partition_email_header,
|
||||
)
|
||||
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
@ -17,6 +27,23 @@ EXPECTED_OUTPUT = [
|
||||
ListItem(text="Violets are blue"),
|
||||
]
|
||||
|
||||
HEADER_EXPECTED_OUTPUT = [
|
||||
MetaData(name="MIME-Version", text="1.0"),
|
||||
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
|
||||
MetaData(
|
||||
name="Message-ID",
|
||||
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
|
||||
),
|
||||
Subject(text="Test Email"),
|
||||
Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
|
||||
Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
|
||||
MetaData(
|
||||
name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"'
|
||||
),
|
||||
]
|
||||
|
||||
ALL_EXPECTED_OUTPUT = HEADER_EXPECTED_OUTPUT + EXPECTED_OUTPUT
|
||||
|
||||
ATTACH_EXPECTED_OUTPUT = [
|
||||
{"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"}
|
||||
]
|
||||
@ -37,6 +64,22 @@ def test_partition_email_from_file():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_text_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.txt")
|
||||
with open(filename, "r") as f:
|
||||
elements = partition_email(file=f, content_source="text/plain")
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_text_file_with_headers():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.txt")
|
||||
with open(filename, "r") as f:
|
||||
elements = partition_email(file=f, content_source="text/plain", include_headers=True)
|
||||
assert len(elements) > 0
|
||||
assert elements == ALL_EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
with open(filename, "r") as f:
|
||||
@ -46,6 +89,15 @@ def test_partition_email_from_text():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_header():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
with open(filename, "r") as f:
|
||||
msg = email.message_from_file(f)
|
||||
elements = partition_email_header(msg)
|
||||
assert len(elements) > 0
|
||||
assert elements == HEADER_EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_extract_attachment_info():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
|
||||
with open(filename, "r") as f:
|
||||
|
||||
54
test_unstructured/partition/test_text.py
Normal file
54
test_unstructured/partition/test_text.py
Normal file
@ -0,0 +1,54 @@
|
||||
import os
|
||||
import pathlib
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import NarrativeText, Title, ListItem
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
EXPECTED_OUTPUT = [
|
||||
NarrativeText(text="This is a test document to use for unit tests."),
|
||||
Title(text="Important points:"),
|
||||
ListItem(text="Hamburgers are delicious"),
|
||||
ListItem(text="Dogs are the best"),
|
||||
ListItem(text="I love fuzzy blankets"),
|
||||
]
|
||||
|
||||
|
||||
def test_partition_email_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
elements = partition_text(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "r") as f:
|
||||
elements = partition_text(file=f)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "r") as f:
|
||||
text = f.read()
|
||||
elements = partition_text(text=text)
|
||||
assert len(elements) > 0
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_raises_with_none_specified():
|
||||
with pytest.raises(ValueError):
|
||||
partition_text()
|
||||
|
||||
|
||||
def test_partition_email_raises_with_too_many_specified():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||
with open(filename, "r") as f:
|
||||
text = f.read()
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
partition_text(filename=filename, text=text)
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.3.6-dev1" # pragma: no cover
|
||||
__version__ = "0.3.6-dev2" # pragma: no cover
|
||||
|
||||
@ -1,4 +1,13 @@
|
||||
import re
|
||||
import datetime
|
||||
from typing import List
|
||||
from unstructured.nlp.patterns import (
|
||||
IP_ADDRESS_PATTERN_RE,
|
||||
IP_ADDRESS_NAME_PATTERN,
|
||||
MAPI_ID_PATTERN,
|
||||
EMAIL_DATETIMETZ_PATTERN,
|
||||
EMAIL_ADDRESS_PATTERN,
|
||||
)
|
||||
|
||||
from unstructured.nlp.patterns import US_PHONE_NUMBERS_RE
|
||||
|
||||
@ -48,6 +57,29 @@ def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = Tr
|
||||
return before_text.lstrip() if strip else before_text
|
||||
|
||||
|
||||
def extract_email_address(text: str) -> List[str]:
    """Returns every email address found in the text, in order of appearance.

    The text is lowercased before matching, so the returned addresses are
    always lowercase.
    """
    lowered = text.lower()
    return re.findall(EMAIL_ADDRESS_PATTERN, lowered)
|
||||
|
||||
|
||||
def extract_ip_address(text: str) -> List[str]:
    """Returns every substring of the text that matches IP_ADDRESS_PATTERN_RE,
    in order of appearance."""
    return IP_ADDRESS_PATTERN_RE.findall(text)
|
||||
|
||||
|
||||
def extract_ip_address_name(text: str) -> List[str]:
    """Returns every substring of the text that matches IP_ADDRESS_NAME_PATTERN,
    in order of appearance."""
    names = re.findall(IP_ADDRESS_NAME_PATTERN, text)
    return names
|
||||
|
||||
|
||||
def extract_mapi_id(text: str) -> List[str]:
    """Returns every match of MAPI_ID_PATTERN in the text with any ";"
    characters removed from each match."""
    return [raw_id.replace(";", "") for raw_id in re.findall(MAPI_ID_PATTERN, text)]
|
||||
|
||||
|
||||
def extract_datetimetz(text: str) -> datetime.datetime:
    """Parses the first date/time/timezone stamp in the text into a datetime.

    The first match of EMAIL_DATETIMETZ_PATTERN is parsed with the format
    "%a, %d %b %Y %H:%M:%S %z", e.g. "Fri, 26 Mar 2021 11:04:09 +1200".

    Raises
    ------
    IndexError
        If the text contains no match for EMAIL_DATETIMETZ_PATTERN.
    """
    matches = re.findall(EMAIL_DATETIMETZ_PATTERN, text)
    first_stamp = matches[0]
    return datetime.datetime.strptime(first_stamp, "%a, %d %b %Y %H:%M:%S %z")
|
||||
|
||||
|
||||
def extract_us_phone_number(text: str):
|
||||
"""Extracts a US phone number from a section of text that includes a phone number. If there
|
||||
is no phone number present, the result will be an empty string.
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from abc import ABC
|
||||
from datetime import datetime
|
||||
import hashlib
|
||||
from typing import Callable, List, Union
|
||||
from unstructured.documents.elements import Element, Text, NoID
|
||||
@ -15,9 +16,16 @@ class Name(EmailElement):
|
||||
|
||||
category = "Uncategorized"
|
||||
|
||||
def __init__(self, name: str, text: str, element_id: Union[str, NoID] = NoID()):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
text: str,
|
||||
element_id: Union[str, NoID] = NoID(),
|
||||
):
|
||||
self.name: str = name
|
||||
self.text: str = text
|
||||
self.datestamp: datetime
|
||||
self.has_datestamp: bool = False
|
||||
|
||||
if isinstance(element_id, NoID):
|
||||
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
|
||||
@ -25,10 +33,20 @@ class Name(EmailElement):
|
||||
|
||||
super().__init__(element_id=element_id)
|
||||
|
||||
def set_datestamp(self, datestamp: datetime):
|
||||
self.datestamp = datestamp
|
||||
self.has_datestamp = True
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name}: {self.text}"
|
||||
|
||||
def __eq__(self, other):
|
||||
if self.has_datestamp:
|
||||
return (
|
||||
self.name == other.name
|
||||
and self.text == other.text
|
||||
and self.datestamp == other.datestamp
|
||||
)
|
||||
return self.name == other.name and self.text == other.text
|
||||
|
||||
def apply(self, *cleaners: Callable):
|
||||
@ -60,54 +78,50 @@ class BodyText(List[Text]):
|
||||
pass
|
||||
|
||||
|
||||
class Recipient(Text):
|
||||
"""A text element for capturing the recipient information of an email (e.g. Subject,
|
||||
To, From, etc)."""
|
||||
class Recipient(Name):
|
||||
"""A text element for capturing the recipient information of an email"""
|
||||
|
||||
category = "Recipient"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Sender(Text):
|
||||
"""A text element for capturing the sender information of an email (e.g. Subject,
|
||||
To, From, etc)."""
|
||||
class Sender(Name):
|
||||
"""A text element for capturing the sender information of an email"""
|
||||
|
||||
category = "Sender"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Subject(Text):
|
||||
"""A text element for capturing the subject information of an email (e.g. Subject,
|
||||
To, From, etc)."""
|
||||
class Subject(Text, EmailElement):
|
||||
"""A text element for capturing the subject information of an email"""
|
||||
|
||||
category = "Subject"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class ReceivedInfo(List[Text]):
|
||||
"""A text element for capturing header information of an email (e.g. Subject,
|
||||
To, From, etc)."""
|
||||
|
||||
category = "ReceivedInfo"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class MetaData(Name):
|
||||
"""A text element for capturing header meta data of an email (e.g. Subject,
|
||||
To, From, etc)."""
|
||||
"""A text element for capturing header meta data of an email
|
||||
(miscellaneous data in the email)."""
|
||||
|
||||
category = "MetaData"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class ReceivedInfo(Name):
|
||||
"""A text element for capturing header information of an email (e.g. IP addresses, etc)."""
|
||||
|
||||
category = "ReceivedInfo"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Attachment(Name):
|
||||
"""A text element for capturing the attachment name in an email (e.g. Subject,
|
||||
To, From, etc)."""
|
||||
"""A text element for capturing the attachment name in an email (e.g. documents,
|
||||
images, etc)."""
|
||||
|
||||
category = "Attachment"
|
||||
|
||||
@ -117,11 +131,11 @@ class Attachment(Name):
|
||||
class Email(ABC):
|
||||
"""An email class with it's attributes"""
|
||||
|
||||
def __init__(self, recipient: Recipient, sender: Sender, subject: Subject, body: BodyText):
|
||||
self.recipient = recipient
|
||||
self.sender = sender
|
||||
self.subject = subject
|
||||
self.body = body
|
||||
def __init__(self):
|
||||
self.recipient = Recipient
|
||||
self.sender = Sender
|
||||
self.subject = Subject
|
||||
self.body = BodyText
|
||||
self.received_info: ReceivedInfo
|
||||
self.meta_data: MetaData
|
||||
self.attachment: List[Attachment]
|
||||
|
||||
@ -41,3 +41,28 @@ UNICODE_BULLETS: Final[List[str]] = [
|
||||
"·",
|
||||
]
|
||||
UNICODE_BULLETS_RE = re.compile(f"({'|'.join(UNICODE_BULLETS)})")
|
||||
|
||||
# Helps split text by paragraphs. The longer newline runs are listed first so a
# run of newlines is consumed by the longest alternative that fits.
PARAGRAPH_PATTERN = "\n\n\n|\n\n|\r\n|\r|\n"

# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
# NOTE: the patterns below are raw strings so that regex escapes such as "\."
# reach the regex engine unchanged without needing a W605 suppression.
IP_ADDRESS_PATTERN = (
    # IPv4-style address; each of the four parts may have one to three digits
    r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}",
    # IPv6-style address of the form xxxx::xxxx:xxxx:xxxx:xxxx with an optional %NN suffix
    r"[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
)
IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")

# IP address name example: ABC.DEF.local
IP_ADDRESS_NAME_PATTERN = r"[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"

# Mapi ID example: 32.88.5467.123
# The pattern includes the trailing ";" that follows the id in the header.
MAPI_ID_PATTERN = r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"

# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
EMAIL_DATETIMETZ_PATTERN = (
    r"[a-zA-Z]{3},\s[0-9]{2}\s[a-zA-Z]{3}\s[0-9]{4}\s"
    r"[0-9]{2}:[0-9]{2}:[0-9]{2}\s[+0-9]{5}"
)

# Email address example: im.rabn@npf.gov.nr (matched against lowercase text)
EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
|
||||
|
||||
@ -1,7 +1,8 @@
|
||||
import email
|
||||
import sys
|
||||
import re
|
||||
from email.message import Message
|
||||
from typing import Dict, IO, List, Optional
|
||||
from typing import Dict, IO, List, Optional, Tuple
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
@ -9,11 +10,77 @@ else:
|
||||
from typing import Final
|
||||
|
||||
from unstructured.cleaners.core import replace_mime_encodings, clean_extra_whitespace
|
||||
from unstructured.cleaners.extract import (
|
||||
extract_ip_address,
|
||||
extract_ip_address_name,
|
||||
extract_mapi_id,
|
||||
extract_datetimetz,
|
||||
extract_email_address,
|
||||
)
|
||||
from unstructured.documents.email_elements import (
|
||||
Recipient,
|
||||
Sender,
|
||||
Subject,
|
||||
ReceivedInfo,
|
||||
MetaData,
|
||||
)
|
||||
from unstructured.documents.elements import Element, Text
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.text import split_by_paragraph, partition_text
|
||||
|
||||
|
||||
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html"]
|
||||
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
|
||||
|
||||
|
||||
def _parse_received_data(data: str) -> List[Element]:
    """Converts the value of a ``Received`` email header into ReceivedInfo elements.

    Produces, in order:
    * one element per (IP address name, IP address) pair, paired positionally;
    * one ``mapi_id`` element for the first mapi id found, if any;
    * one ``received_datetimetz`` element carrying the parsed datestamp, if the
      header contains a parseable date/time/timezone stamp.
    """
    elements: List[Element] = []

    ip_address_names = extract_ip_address_name(data)
    ip_addresses = extract_ip_address(data)
    # zip stops at the shorter list, so nothing is added when either is empty
    for name, ip in zip(ip_address_names, ip_addresses):
        elements.append(ReceivedInfo(name=name, text=ip))

    mapi_id = extract_mapi_id(data)
    if mapi_id:
        elements.append(ReceivedInfo(name="mapi_id", text=mapi_id[0]))

    # extract_datetimetz raises IndexError when no stamp is found and strptime
    # raises ValueError on an unparseable one; neither should abort parsing of
    # the remaining header data.
    try:
        datetimetz = extract_datetimetz(data)
    except (IndexError, ValueError):
        datetimetz = None

    if datetimetz is not None:
        # set_datestamp mutates the element and returns None, so it must be
        # called on a named element rather than chained inside append().
        received_at = ReceivedInfo(name="received_datetimetz", text=str(datetimetz))
        received_at.set_datestamp(datestamp=datetimetz)
        elements.append(received_at)

    return elements
|
||||
|
||||
|
||||
def _parse_email_address(data: str) -> Tuple[str, str]:
    """Splits an address header value such as ``Name <user@example.com>`` into a
    display name and an email address.

    Returns
    -------
    A ``(name, address)`` tuple. ``name`` is the text that precedes the first
    bracketed address, lowercased and then title-cased and stripped. ``address``
    is the first address returned by ``extract_email_address``, or an empty
    string when the value contains no email address.
    """
    addresses = extract_email_address(data)

    # Raw string so the regex escapes are passed through to the regex engine as written
    bracketed_address = r"<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>"
    name = re.split(bracketed_address, data.lower())[0].title().strip()

    # Guard against header values with no address instead of raising IndexError
    address = addresses[0] if addresses else ""
    return name, address
|
||||
|
||||
|
||||
def partition_email_header(msg: Message) -> List[Element]:
    """Converts the headers of an email message into a list of elements.

    ``To`` headers become Recipient elements, ``From`` headers become Sender
    elements, ``Subject`` headers become Subject elements, and each ``Received``
    header is expanded by ``_parse_received_data``. Every other header becomes a
    MetaData element named after the header. Elements are returned in the order
    the headers appear in ``msg.raw_items()``.

    Header names are compared case-insensitively because email header field
    names are not case sensitive; MetaData elements keep the original spelling.
    """
    elements: List[Element] = []
    for header_name, header_value in msg.raw_items():
        field = header_name.lower()
        if field == "to":
            name, address = _parse_email_address(header_value)
            elements.append(Recipient(name=name, text=address))
        elif field == "from":
            name, address = _parse_email_address(header_value)
            elements.append(Sender(name=name, text=address))
        elif field == "subject":
            elements.append(Subject(text=header_value))
        elif field == "received":
            elements += _parse_received_data(header_value)
        else:
            elements.append(MetaData(name=header_name, text=header_value))

    return elements
|
||||
|
||||
|
||||
def extract_attachment_info(
|
||||
@ -40,7 +107,7 @@ def extract_attachment_info(
|
||||
if output_dir:
|
||||
filename = output_dir + "/" + attachment["filename"]
|
||||
with open(filename, "wb") as f:
|
||||
# mypy wants to just us `w` when opening the file but this
|
||||
# Note(harrell) mypy wants to just us `w` when opening the file but this
|
||||
# causes an error since the payloads are bytes not str
|
||||
f.write(attachment["payload"]) # type: ignore
|
||||
return list_attachments
|
||||
@ -51,6 +118,7 @@ def partition_email(
|
||||
file: Optional[IO] = None,
|
||||
text: Optional[str] = None,
|
||||
content_source: str = "text/html",
|
||||
include_headers: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Partitions an .eml documents into its constituent elements.
|
||||
Parameters
|
||||
@ -61,6 +129,9 @@ def partition_email(
|
||||
A file-like object using "r" mode --> open(filename, "r").
|
||||
text
|
||||
The string representation of the .eml document.
|
||||
content_source
|
||||
default: "text/html"
|
||||
other: "text/plain"
|
||||
"""
|
||||
if content_source not in VALID_CONTENT_SOURCES:
|
||||
raise ValueError(
|
||||
@ -92,7 +163,7 @@ def partition_email(
|
||||
|
||||
content = content_map.get(content_source, "")
|
||||
if not content:
|
||||
raise ValueError("text/html content not found in email")
|
||||
raise ValueError(f"{content_source} content not found in email")
|
||||
|
||||
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
|
||||
# looks like the following, resulting in extraneous "=" chracters in the output if
|
||||
@ -101,11 +172,19 @@ def partition_email(
|
||||
# <li>Item 1</li>=
|
||||
# <li>Item 2<li>=
|
||||
# </ul>
|
||||
content = "".join(content.split("=\n"))
|
||||
list_content = split_by_paragraph(content)
|
||||
|
||||
elements = partition_html(text=content)
|
||||
for element in elements:
|
||||
if isinstance(element, Text):
|
||||
element.apply(replace_mime_encodings)
|
||||
if content_source == "text/html":
|
||||
content = "".join(list_content)
|
||||
elements = partition_html(text=content)
|
||||
for element in elements:
|
||||
if isinstance(element, Text):
|
||||
element.apply(replace_mime_encodings)
|
||||
elif content_source == "text/plain":
|
||||
elements = partition_text(text=content)
|
||||
|
||||
return elements
|
||||
header: List[Element] = list()
|
||||
if include_headers:
|
||||
header = partition_email_header(msg)
|
||||
all_elements = header + elements
|
||||
return all_elements
|
||||
|
||||
66
unstructured/partition/text.py
Normal file
66
unstructured/partition/text.py
Normal file
@ -0,0 +1,66 @@
|
||||
import re
|
||||
from typing import IO, List, Optional
|
||||
|
||||
from unstructured.documents.elements import Element, ListItem, NarrativeText, Title
|
||||
|
||||
from unstructured.cleaners.core import clean_bullets
|
||||
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||
from unstructured.partition.text_type import (
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
is_bulleted_text,
|
||||
)
|
||||
|
||||
|
||||
def split_by_paragraph(content: str) -> List[str]:
    """Splits the content into chunks wherever PARAGRAPH_PATTERN matches and
    returns the chunks in order."""
    paragraph_splitter = re.compile(PARAGRAPH_PATTERN)
    return paragraph_splitter.split(content)
|
||||
|
||||
|
||||
def partition_text(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
) -> List[Element]:
    """Partitions a .txt document into its constituent elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the .txt document.

    Returns
    -------
    A list with one element per non-empty paragraph that is classified as a
    bulleted item (ListItem, with the bullet removed), narrative text
    (NarrativeText) or a title (Title), checked in that order. Paragraphs that
    match none of these are omitted.

    Raises
    ------
    ValueError
        If none of filename, file, or text is specified, or if more than one
        of them is specified.
    """

    if not any([filename, file, text]):
        raise ValueError("One of filename, file, or text must be specified.")

    if filename is not None and not file and not text:
        with open(filename, "r") as f:
            file_text = f.read()

    elif file is not None and not filename and not text:
        file_text = file.read()

    elif text is not None and not filename and not file:
        file_text = str(text)

    else:
        raise ValueError("Only one of filename, file, or text can be specified.")

    file_content = split_by_paragraph(file_text)

    elements: List[Element] = []
    for ctext in file_content:
        ctext = ctext.strip()

        # Consecutive separators (e.g. several blank lines) produce empty chunks.
        # Skip them rather than stopping, so the text after them is still processed.
        if ctext == "":
            continue

        if is_bulleted_text(ctext):
            elements.append(ListItem(text=clean_bullets(ctext)))
        elif is_possible_narrative_text(ctext):
            elements.append(NarrativeText(text=ctext))
        elif is_possible_title(ctext):
            elements.append(Title(text=ctext))

    return elements
|
||||
Loading…
x
Reference in New Issue
Block a user