mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-05 20:37:36 +00:00
feat: Add new functionality to parse text and header of emails (#111)
* partition_text function
This commit is contained in:
parent
7fb8713527
commit
d7a00046a9
@ -1,10 +1,14 @@
|
|||||||
## 0.3.6-dev1
|
## 0.3.6-dev2
|
||||||
|
|
||||||
* Cleaning brick for removing ordered bullets `clean_ordered_bullets`.
|
* Cleaning brick for removing ordered bullets `clean_ordered_bullets`.
|
||||||
* Extract brick method for ordered bullets `extract_ordered_bullets`.
|
* Extract brick method for ordered bullets `extract_ordered_bullets`.
|
||||||
* Test for `clean_ordered_bullets`.
|
* Test for `clean_ordered_bullets`.
|
||||||
* Test for `extract_ordered_bullets`.
|
* Test for `extract_ordered_bullets`.
|
||||||
* Added `partition_docx` for pre-processing Word Documents.
|
* Added `partition_docx` for pre-processing Word Documents.
|
||||||
|
* Added new REGEX patterns to extract email header information
|
||||||
|
* Added new functions to extract header information: `_parse_received_data` and `partition_email_header`
|
||||||
|
* Added new function to parse plain text files `partition_text`
|
||||||
|
* Added new cleaner functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
|
||||||
|
|
||||||
## 0.3.5
|
## 0.3.5
|
||||||
|
|
||||||
@ -18,6 +22,7 @@
|
|||||||
* Add new function `extract_attachment_info` that extracts and decode the attachment
|
* Add new function `extract_attachment_info` that extracts and decode the attachment
|
||||||
of an email.
|
of an email.
|
||||||
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
|
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
|
||||||
|
* Add plain text functionality to `partition_email`
|
||||||
|
|
||||||
## 0.3.4
|
## 0.3.4
|
||||||
|
|
||||||
|
|||||||
45
README.md
45
README.md
@ -190,6 +190,51 @@ Roses are red
|
|||||||
Violets are blue
|
Violets are blue
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Text Document Parsing
|
||||||
|
|
||||||
|
The `partition_text` function within `unstructured` can be used to parse simple
|
||||||
|
text files into elements.
|
||||||
|
|
||||||
|
`partition_text` accepts a filename, a file-like object, or raw text as input. The following three snippets show each way of parsing a text file:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
|
elements = partition_text(filename="example-docs/fake-text.txt")
|
||||||
|
|
||||||
|
with open("example-docs/fake-text.txt", "r") as f:
|
||||||
|
elements = partition_text(file=f)
|
||||||
|
|
||||||
|
with open("example-docs/fake-text.txt", "r") as f:
|
||||||
|
text = f.read()
|
||||||
|
elements = partition_text(text=text)
|
||||||
|
```
|
||||||
|
|
||||||
|
The `elements` output will look like the following:
|
||||||
|
|
||||||
|
```python
|
||||||
|
[<unstructured.documents.html.HTMLNarrativeText at 0x13ab14370>,
|
||||||
|
<unstructured.documents.html.HTMLTitle at 0x106877970>,
|
||||||
|
<unstructured.documents.html.HTMLListItem at 0x1068776a0>,
|
||||||
|
<unstructured.documents.html.HTMLListItem at 0x13fe4b0a0>]
|
||||||
|
```
|
||||||
|
|
||||||
|
Run `print("\n\n".join([str(el) for el in elements]))` to get a string representation of the
|
||||||
|
output, which looks like:
|
||||||
|
|
||||||
|
```python
|
||||||
|
This is a test document to use for unit tests.
|
||||||
|
|
||||||
|
Important points:
|
||||||
|
|
||||||
|
Hamburgers are delicious
|
||||||
|
|
||||||
|
Dogs are the best
|
||||||
|
|
||||||
|
I love fuzzy blankets
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
## :guardsman: Security Policy
|
## :guardsman: Security Policy
|
||||||
|
|
||||||
See our [security policy](https://github.com/Unstructured-IO/unstructured/security/policy) for
|
See our [security policy](https://github.com/Unstructured-IO/unstructured/security/policy) for
|
||||||
|
|||||||
@ -90,7 +90,11 @@ Examples:
|
|||||||
The ``partition_email`` function partitions ``.eml`` documents and works with exports
|
The ``partition_email`` function partitions ``.eml`` documents and works with exports
|
||||||
from email clients such as Microsoft Outlook and Gmail. The ``partition_email``
|
from email clients such as Microsoft Outlook and Gmail. The ``partition_email``
|
||||||
takes a filename, file-like object, or raw text as input and produces a list of
|
takes a filename, file-like object, or raw text as input and produces a list of
|
||||||
document ``Element`` objects as output.
|
document ``Element`` objects as output. Also ``content_source`` can be set to ``text/html``
|
||||||
|
(default) or ``text/plain`` to process the html or plain text version of the email, respectively.
|
||||||
|
In order for ``partition_email`` to also return the header information (e.g. sender, recipient,
|
||||||
|
attachment, etc.), ``include_headers`` must be set to ``True``. In that case a single list is
|
||||||
|
returned, with the header elements first and the body elements after them.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
@ -107,6 +111,37 @@ Examples:
|
|||||||
text = f.read()
|
text = f.read()
|
||||||
elements = partition_email(text=text)
|
elements = partition_email(text=text)
|
||||||
|
|
||||||
|
with open("example-docs/fake-email.eml", "r") as f:
|
||||||
|
text = f.read()
|
||||||
|
elements = partition_email(text=text, content_source="text/plain")
|
||||||
|
|
||||||
|
with open("example-docs/fake-email.eml", "r") as f:
|
||||||
|
text = f.read()
|
||||||
|
elements = partition_email(text=text, include_headers=True)
|
||||||
|
|
||||||
|
|
||||||
|
``partition_text``
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
The ``partition_text`` function partitions text files. The ``partition_text``
|
||||||
|
takes a filename, file-like object, or raw text as input and produces a list of ``Element`` objects as output.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
|
elements = partition_text(filename="example-docs/fake-text.txt")
|
||||||
|
|
||||||
|
with open("example-docs/fake-text.txt", "r") as f:
|
||||||
|
elements = partition_text(file=f)
|
||||||
|
|
||||||
|
with open("example-docs/fake-text.txt", "r") as f:
|
||||||
|
text = f.read()
|
||||||
|
elements = partition_text(text=text)
|
||||||
|
|
||||||
|
|
||||||
``extract_attachment_info``
|
``extract_attachment_info``
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
@ -550,6 +585,96 @@ Examples:
|
|||||||
# Returns "Look at me, I'm flying!"
|
# Returns "Look at me, I'm flying!"
|
||||||
extract_text_after(text, r"SPEAKER \d{1}:")
|
extract_text_after(text, r"SPEAKER \d{1}:")
|
||||||
|
|
||||||
|
``extract_email_address``
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
Extracts email addresses from a string input and returns a list of all the email
|
||||||
|
addresses in the input string.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.extract import extract_email_address
|
||||||
|
|
||||||
|
text = """Me me@email.com and You <You@email.com>
|
||||||
|
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
||||||
|
|
||||||
|
# Returns "['me@email.com', 'you@email.com']"
|
||||||
|
extract_email_address(text)
|
||||||
|
|
||||||
|
|
||||||
|
``extract_ip_address``
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
Extracts IPv4 and IPv6 IP addresses in the input string and
|
||||||
|
returns a list of all IP addresses in the input string.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.extract import extract_ip_address
|
||||||
|
|
||||||
|
text = """Me me@email.com and You <You@email.com>
|
||||||
|
([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""
|
||||||
|
|
||||||
|
# Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']"
|
||||||
|
extract_ip_address(text)
|
||||||
|
|
||||||
|
|
||||||
|
``extract_ip_address_name``
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
Extracts the names of each IP address in the ``Received`` field(s) from an ``.eml``
|
||||||
|
file. ``extract_ip_address_name`` takes in a string and returns a list of all
|
||||||
|
IP address names in the input string.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.extract import extract_ip_address_name
|
||||||
|
|
||||||
|
text = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||||
|
\n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||||
|
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||||
|
|
||||||
|
# Returns "['ABC.DEF.local', 'ABC.DEF.local2']"
|
||||||
|
extract_ip_address_name(text)
|
||||||
|
|
||||||
|
|
||||||
|
``extract_mapi_id``
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Extracts the ``mapi id`` in the ``Received`` field(s) from an ``.eml``
|
||||||
|
file. ``extract_mapi_id`` takes in a string and returns a list of strings
|
||||||
|
containing each ``mapi id`` found in the input string.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.extract import extract_mapi_id
|
||||||
|
|
||||||
|
text = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||||
|
\n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||||
|
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||||
|
|
||||||
|
# Returns "['32.88.5467.123']"
|
||||||
|
extract_mapi_id(text)
|
||||||
|
|
||||||
|
|
||||||
|
``extract_datetimetz``
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Extracts the date, time, and timezone in the ``Received`` field(s) from an ``.eml``
|
||||||
|
file. ``extract_datetimetz`` takes in a string and returns a datetime.datetime
|
||||||
|
object from the input string.
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.cleaners.extract import extract_datetimetz
|
||||||
|
|
||||||
|
text = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||||
|
\n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||||
|
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||||
|
|
||||||
|
# Returns datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)))
|
||||||
|
extract_datetimetz(text)
|
||||||
|
|
||||||
|
|
||||||
``extract_us_phone_number``
|
``extract_us_phone_number``
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|||||||
24
example-docs/fake-email.txt
Normal file
24
example-docs/fake-email.txt
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
MIME-Version: 1.0
|
||||||
|
Date: Fri, 16 Dec 2022 17:04:16 -0500
|
||||||
|
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
|
||||||
|
Subject: Test Email
|
||||||
|
From: Matthew Robinson <mrobinson@unstructured.io>
|
||||||
|
To: Matthew Robinson <mrobinson@unstructured.io>
|
||||||
|
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
|
||||||
|
|
||||||
|
--00000000000095c9b205eff92630
|
||||||
|
Content-Type: text/plain; charset="UTF-8"
|
||||||
|
|
||||||
|
This is a test email to use for unit tests.
|
||||||
|
|
||||||
|
Important points:
|
||||||
|
|
||||||
|
- Roses are red
|
||||||
|
- Violets are blue
|
||||||
|
|
||||||
|
--00000000000095c9b205eff92630
|
||||||
|
Content-Type: text/html; charset="UTF-8"
|
||||||
|
|
||||||
|
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
|
||||||
|
|
||||||
|
--00000000000095c9b205eff92630--
|
||||||
7
example-docs/fake-text.txt
Normal file
7
example-docs/fake-text.txt
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
This is a test document to use for unit tests.
|
||||||
|
|
||||||
|
Important points:
|
||||||
|
|
||||||
|
- Hamburgers are delicious
|
||||||
|
- Dogs are the best
|
||||||
|
- I love fuzzy blankets
|
||||||
66
requirements.txt
Normal file
66
requirements.txt
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
#
|
||||||
|
# This file is autogenerated by pip-compile with python 3.9
|
||||||
|
# To update, run:
|
||||||
|
#
|
||||||
|
# pip-compile
|
||||||
|
#
|
||||||
|
argilla==1.1.1
|
||||||
|
# via unstructured (setup.py)
|
||||||
|
backoff==2.2.1
|
||||||
|
# via argilla
|
||||||
|
certifi==2022.12.7
|
||||||
|
# via httpx
|
||||||
|
click==8.1.3
|
||||||
|
# via nltk
|
||||||
|
deprecated==1.2.13
|
||||||
|
# via argilla
|
||||||
|
h11==0.9.0
|
||||||
|
# via httpcore
|
||||||
|
httpcore==0.11.1
|
||||||
|
# via httpx
|
||||||
|
httpx==0.15.5
|
||||||
|
# via argilla
|
||||||
|
idna==3.4
|
||||||
|
# via rfc3986
|
||||||
|
joblib==1.2.0
|
||||||
|
# via nltk
|
||||||
|
lxml==4.9.2
|
||||||
|
# via unstructured (setup.py)
|
||||||
|
monotonic==1.6
|
||||||
|
# via argilla
|
||||||
|
nltk==3.8
|
||||||
|
# via unstructured (setup.py)
|
||||||
|
numpy==1.23.5
|
||||||
|
# via
|
||||||
|
# argilla
|
||||||
|
# pandas
|
||||||
|
packaging==22.0
|
||||||
|
# via argilla
|
||||||
|
pandas==1.5.2
|
||||||
|
# via argilla
|
||||||
|
pydantic==1.10.2
|
||||||
|
# via argilla
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
# via pandas
|
||||||
|
pytz==2022.6
|
||||||
|
# via pandas
|
||||||
|
regex==2022.10.31
|
||||||
|
# via nltk
|
||||||
|
rfc3986[idna2008]==1.5.0
|
||||||
|
# via httpx
|
||||||
|
six==1.16.0
|
||||||
|
# via python-dateutil
|
||||||
|
sniffio==1.3.0
|
||||||
|
# via
|
||||||
|
# httpcore
|
||||||
|
# httpx
|
||||||
|
tqdm==4.64.1
|
||||||
|
# via
|
||||||
|
# argilla
|
||||||
|
# nltk
|
||||||
|
typing-extensions==4.4.0
|
||||||
|
# via pydantic
|
||||||
|
wrapt==1.13.3
|
||||||
|
# via
|
||||||
|
# argilla
|
||||||
|
# deprecated
|
||||||
@ -1,7 +1,12 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
import datetime
|
||||||
|
|
||||||
import unstructured.cleaners.extract as extract
|
import unstructured.cleaners.extract as extract
|
||||||
|
|
||||||
|
EMAIL_META_DATA_INPUT = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
|
||||||
|
\n ABC.DEF.local ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
|
||||||
|
n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""
|
||||||
|
|
||||||
|
|
||||||
def test_get_indexed_match_raises_with_bad_index():
|
def test_get_indexed_match_raises_with_bad_index():
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
@ -23,6 +28,35 @@ def test_extract_text_after():
|
|||||||
assert extract.extract_text_after(text, "BLAH;", 0) == "Student: BLAH BLAH BLAH!"
|
assert extract.extract_text_after(text, "BLAH;", 0) == "Student: BLAH BLAH BLAH!"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_email_address():
|
||||||
|
text = "Im Rabn <Im.Rabn@npf.gov.nr>"
|
||||||
|
assert extract.extract_email_address(text) == ["im.rabn@npf.gov.nr"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_ip_address():
|
||||||
|
assert extract.extract_ip_address(EMAIL_META_DATA_INPUT) == [
|
||||||
|
"ba23::58b5:2236:45g2:88h2",
|
||||||
|
"ba23::58b5:2236:45g2:88h2%25",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_ip_address_name():
|
||||||
|
assert extract.extract_ip_address_name(EMAIL_META_DATA_INPUT) == [
|
||||||
|
"ABC.DEF.local",
|
||||||
|
"ABC.DEF.local",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_mapi_id():
|
||||||
|
assert extract.extract_mapi_id(EMAIL_META_DATA_INPUT) == ["32.88.5467.123"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_datetimetz():
|
||||||
|
assert extract.extract_datetimetz(EMAIL_META_DATA_INPUT) == datetime.datetime(
|
||||||
|
2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text, expected",
|
"text, expected",
|
||||||
[
|
[
|
||||||
|
|||||||
@ -4,7 +4,17 @@ import pathlib
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from unstructured.documents.elements import NarrativeText, Title, ListItem
|
from unstructured.documents.elements import NarrativeText, Title, ListItem
|
||||||
from unstructured.partition.email import partition_email, extract_attachment_info
|
from unstructured.documents.email_elements import (
|
||||||
|
MetaData,
|
||||||
|
Recipient,
|
||||||
|
Sender,
|
||||||
|
Subject,
|
||||||
|
)
|
||||||
|
from unstructured.partition.email import (
|
||||||
|
extract_attachment_info,
|
||||||
|
partition_email,
|
||||||
|
partition_email_header,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
@ -17,6 +27,23 @@ EXPECTED_OUTPUT = [
|
|||||||
ListItem(text="Violets are blue"),
|
ListItem(text="Violets are blue"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
HEADER_EXPECTED_OUTPUT = [
|
||||||
|
MetaData(name="MIME-Version", text="1.0"),
|
||||||
|
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
|
||||||
|
MetaData(
|
||||||
|
name="Message-ID",
|
||||||
|
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
|
||||||
|
),
|
||||||
|
Subject(text="Test Email"),
|
||||||
|
Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
|
||||||
|
Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
|
||||||
|
MetaData(
|
||||||
|
name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"'
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
ALL_EXPECTED_OUTPUT = HEADER_EXPECTED_OUTPUT + EXPECTED_OUTPUT
|
||||||
|
|
||||||
ATTACH_EXPECTED_OUTPUT = [
|
ATTACH_EXPECTED_OUTPUT = [
|
||||||
{"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"}
|
{"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"}
|
||||||
]
|
]
|
||||||
@ -37,6 +64,22 @@ def test_partition_email_from_file():
|
|||||||
assert elements == EXPECTED_OUTPUT
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_from_text_file():
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.txt")
|
||||||
|
with open(filename, "r") as f:
|
||||||
|
elements = partition_email(file=f, content_source="text/plain")
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_from_text_file_with_headers():
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.txt")
|
||||||
|
with open(filename, "r") as f:
|
||||||
|
elements = partition_email(file=f, content_source="text/plain", include_headers=True)
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == ALL_EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
def test_partition_email_from_text():
|
def test_partition_email_from_text():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
@ -46,6 +89,15 @@ def test_partition_email_from_text():
|
|||||||
assert elements == EXPECTED_OUTPUT
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_header():
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||||
|
with open(filename, "r") as f:
|
||||||
|
msg = email.message_from_file(f)
|
||||||
|
elements = partition_email_header(msg)
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == HEADER_EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
def test_extract_attachment_info():
|
def test_extract_attachment_info():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
|
|||||||
54
test_unstructured/partition/test_text.py
Normal file
54
test_unstructured/partition/test_text.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from unstructured.documents.elements import NarrativeText, Title, ListItem
|
||||||
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
EXPECTED_OUTPUT = [
|
||||||
|
NarrativeText(text="This is a test document to use for unit tests."),
|
||||||
|
Title(text="Important points:"),
|
||||||
|
ListItem(text="Hamburgers are delicious"),
|
||||||
|
ListItem(text="Dogs are the best"),
|
||||||
|
ListItem(text="I love fuzzy blankets"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_from_filename():
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
|
elements = partition_text(filename=filename)
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_from_file():
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
|
with open(filename, "r") as f:
|
||||||
|
elements = partition_text(file=f)
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_from_text():
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
|
with open(filename, "r") as f:
|
||||||
|
text = f.read()
|
||||||
|
elements = partition_text(text=text)
|
||||||
|
assert len(elements) > 0
|
||||||
|
assert elements == EXPECTED_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_raises_with_none_specified():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
partition_text()
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_email_raises_with_too_many_specified():
|
||||||
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-text.txt")
|
||||||
|
with open(filename, "r") as f:
|
||||||
|
text = f.read()
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
partition_text(filename=filename, text=text)
|
||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.3.6-dev1" # pragma: no cover
|
__version__ = "0.3.6-dev2" # pragma: no cover
|
||||||
|
|||||||
@ -1,4 +1,13 @@
|
|||||||
import re
|
import re
|
||||||
|
import datetime
|
||||||
|
from typing import List
|
||||||
|
from unstructured.nlp.patterns import (
|
||||||
|
IP_ADDRESS_PATTERN_RE,
|
||||||
|
IP_ADDRESS_NAME_PATTERN,
|
||||||
|
MAPI_ID_PATTERN,
|
||||||
|
EMAIL_DATETIMETZ_PATTERN,
|
||||||
|
EMAIL_ADDRESS_PATTERN,
|
||||||
|
)
|
||||||
|
|
||||||
from unstructured.nlp.patterns import US_PHONE_NUMBERS_RE
|
from unstructured.nlp.patterns import US_PHONE_NUMBERS_RE
|
||||||
|
|
||||||
@ -48,6 +57,29 @@ def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = Tr
|
|||||||
return before_text.lstrip() if strip else before_text
|
return before_text.lstrip() if strip else before_text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_email_address(text: str) -> List[str]:
|
||||||
|
return re.findall(EMAIL_ADDRESS_PATTERN, text.lower())
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ip_address(text: str) -> List[str]:
|
||||||
|
return re.findall(IP_ADDRESS_PATTERN_RE, text)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ip_address_name(text: str) -> List[str]:
|
||||||
|
return re.findall(IP_ADDRESS_NAME_PATTERN, text)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_mapi_id(text: str) -> List[str]:
|
||||||
|
mapi_ids = re.findall(MAPI_ID_PATTERN, text)
|
||||||
|
mapi_ids = [mid.replace(";", "") for mid in mapi_ids]
|
||||||
|
return mapi_ids
|
||||||
|
|
||||||
|
|
||||||
|
def extract_datetimetz(text: str) -> datetime.datetime:
|
||||||
|
date_string = re.findall(EMAIL_DATETIMETZ_PATTERN, text)
|
||||||
|
return datetime.datetime.strptime(date_string[0], "%a, %d %b %Y %H:%M:%S %z")
|
||||||
|
|
||||||
|
|
||||||
def extract_us_phone_number(text: str):
|
def extract_us_phone_number(text: str):
|
||||||
"""Extracts a US phone number from a section of text that includes a phone number. If there
|
"""Extracts a US phone number from a section of text that includes a phone number. If there
|
||||||
is no phone number present, the result will be an empty string.
|
is no phone number present, the result will be an empty string.
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
from abc import ABC
|
from abc import ABC
|
||||||
|
from datetime import datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
from typing import Callable, List, Union
|
from typing import Callable, List, Union
|
||||||
from unstructured.documents.elements import Element, Text, NoID
|
from unstructured.documents.elements import Element, Text, NoID
|
||||||
@ -15,9 +16,16 @@ class Name(EmailElement):
|
|||||||
|
|
||||||
category = "Uncategorized"
|
category = "Uncategorized"
|
||||||
|
|
||||||
def __init__(self, name: str, text: str, element_id: Union[str, NoID] = NoID()):
|
def __init__(
|
||||||
|
self,
|
||||||
|
name: str,
|
||||||
|
text: str,
|
||||||
|
element_id: Union[str, NoID] = NoID(),
|
||||||
|
):
|
||||||
self.name: str = name
|
self.name: str = name
|
||||||
self.text: str = text
|
self.text: str = text
|
||||||
|
self.datestamp: datetime
|
||||||
|
self.has_datestamp: bool = False
|
||||||
|
|
||||||
if isinstance(element_id, NoID):
|
if isinstance(element_id, NoID):
|
||||||
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
|
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
|
||||||
@ -25,10 +33,20 @@ class Name(EmailElement):
|
|||||||
|
|
||||||
super().__init__(element_id=element_id)
|
super().__init__(element_id=element_id)
|
||||||
|
|
||||||
|
def set_datestamp(self, datestamp: datetime):
|
||||||
|
self.datestamp = datestamp
|
||||||
|
self.has_datestamp = True
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"{self.name}: {self.text}"
|
return f"{self.name}: {self.text}"
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
|
if self.has_datestamp:
|
||||||
|
return (
|
||||||
|
self.name == other.name
|
||||||
|
and self.text == other.text
|
||||||
|
and self.datestamp == other.datestamp
|
||||||
|
)
|
||||||
return self.name == other.name and self.text == other.text
|
return self.name == other.name and self.text == other.text
|
||||||
|
|
||||||
def apply(self, *cleaners: Callable):
|
def apply(self, *cleaners: Callable):
|
||||||
@ -60,54 +78,50 @@ class BodyText(List[Text]):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Recipient(Text):
|
class Recipient(Name):
|
||||||
"""A text element for capturing the recipient information of an email (e.g. Subject,
|
"""A text element for capturing the recipient information of an email"""
|
||||||
To, From, etc)."""
|
|
||||||
|
|
||||||
category = "Recipient"
|
category = "Recipient"
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Sender(Text):
|
class Sender(Name):
|
||||||
"""A text element for capturing the sender information of an email (e.g. Subject,
|
"""A text element for capturing the sender information of an email"""
|
||||||
To, From, etc)."""
|
|
||||||
|
|
||||||
category = "Sender"
|
category = "Sender"
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Subject(Text):
|
class Subject(Text, EmailElement):
|
||||||
"""A text element for capturing the subject information of an email (e.g. Subject,
|
"""A text element for capturing the subject information of an email"""
|
||||||
To, From, etc)."""
|
|
||||||
|
|
||||||
category = "Subject"
|
category = "Subject"
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ReceivedInfo(List[Text]):
|
|
||||||
"""A text element for capturing header information of an email (e.g. Subject,
|
|
||||||
To, From, etc)."""
|
|
||||||
|
|
||||||
category = "ReceivedInfo"
|
|
||||||
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class MetaData(Name):
|
class MetaData(Name):
|
||||||
"""A text element for capturing header meta data of an email (e.g. Subject,
|
"""A text element for capturing header meta data of an email
|
||||||
To, From, etc)."""
|
(miscellaneous data in the email)."""
|
||||||
|
|
||||||
category = "MetaData"
|
category = "MetaData"
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ReceivedInfo(Name):
|
||||||
|
"""A text element for capturing header information of an email (e.g. IP addresses, etc)."""
|
||||||
|
|
||||||
|
category = "ReceivedInfo"
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Attachment(Name):
|
class Attachment(Name):
|
||||||
"""A text element for capturing the attachment name in an email (e.g. Subject,
|
"""A text element for capturing the attachment name in an email (e.g. documents,
|
||||||
To, From, etc)."""
|
images, etc)."""
|
||||||
|
|
||||||
category = "Attachment"
|
category = "Attachment"
|
||||||
|
|
||||||
@ -117,11 +131,11 @@ class Attachment(Name):
|
|||||||
class Email(ABC):
|
class Email(ABC):
|
||||||
"""An email class with it's attributes"""
|
"""An email class with it's attributes"""
|
||||||
|
|
||||||
def __init__(self, recipient: Recipient, sender: Sender, subject: Subject, body: BodyText):
|
def __init__(self):
|
||||||
self.recipient = recipient
|
self.recipient = Recipient
|
||||||
self.sender = sender
|
self.sender = Sender
|
||||||
self.subject = subject
|
self.subject = Subject
|
||||||
self.body = body
|
self.body = BodyText
|
||||||
self.received_info: ReceivedInfo
|
self.received_info: ReceivedInfo
|
||||||
self.meta_data: MetaData
|
self.meta_data: MetaData
|
||||||
self.attachment: List[Attachment]
|
self.attachment: List[Attachment]
|
||||||
|
|||||||
@ -41,3 +41,28 @@ UNICODE_BULLETS: Final[List[str]] = [
|
|||||||
"·",
|
"·",
|
||||||
]
|
]
|
||||||
UNICODE_BULLETS_RE = re.compile(f"({'|'.join(UNICODE_BULLETS)})")
|
UNICODE_BULLETS_RE = re.compile(f"({'|'.join(UNICODE_BULLETS)})")
|
||||||
|
|
||||||
|
# Helps split text by paragraphs
|
||||||
|
PARAGRAPH_PATTERN = "\n\n\n|\n\n|\r\n|\r|\n" # noqa: W605 NOTE(harrell)
|
||||||
|
|
||||||
|
# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
|
||||||
|
IP_ADDRESS_PATTERN = (
|
||||||
|
"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}", # noqa: W605 NOTE(harrell)
|
||||||
|
# - skipping qa because we need the escape for the regex
|
||||||
|
"[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
|
||||||
|
)
|
||||||
|
IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")
|
||||||
|
|
||||||
|
IP_ADDRESS_NAME_PATTERN = "[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*" # noqa: W605 NOTE(harrell)
|
||||||
|
# - skipping qa because we need the escape for the regex
|
||||||
|
|
||||||
|
# Mapi ID example: 32.88.5467.123
|
||||||
|
MAPI_ID_PATTERN = "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;" # noqa: W605 NOTE(harrell)
|
||||||
|
# - skipping qa because we need the escape for the regex
|
||||||
|
|
||||||
|
# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
|
||||||
|
EMAIL_DATETIMETZ_PATTERN = "[a-zA-z]{3},\s[0-9]{2}\s[a-zA-Z]{3}\s[0-9]{4}\s[0-9]{2}:[0-9]{2}:[0-9]{2}\s[+0-9]{5}" # noqa: W605,E501
|
||||||
|
# NOTE(harrell) - skipping qa because we need the escape for the regex
|
||||||
|
|
||||||
|
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
|
||||||
|
# - skipping qa because we need the escape for the regex
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
import email
|
import email
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
from email.message import Message
|
from email.message import Message
|
||||||
from typing import Dict, IO, List, Optional
|
from typing import Dict, IO, List, Optional, Tuple
|
||||||
|
|
||||||
if sys.version_info < (3, 8):
|
if sys.version_info < (3, 8):
|
||||||
from typing_extensions import Final
|
from typing_extensions import Final
|
||||||
@ -9,11 +10,77 @@ else:
|
|||||||
from typing import Final
|
from typing import Final
|
||||||
|
|
||||||
from unstructured.cleaners.core import replace_mime_encodings, clean_extra_whitespace
|
from unstructured.cleaners.core import replace_mime_encodings, clean_extra_whitespace
|
||||||
|
from unstructured.cleaners.extract import (
|
||||||
|
extract_ip_address,
|
||||||
|
extract_ip_address_name,
|
||||||
|
extract_mapi_id,
|
||||||
|
extract_datetimetz,
|
||||||
|
extract_email_address,
|
||||||
|
)
|
||||||
|
from unstructured.documents.email_elements import (
|
||||||
|
Recipient,
|
||||||
|
Sender,
|
||||||
|
Subject,
|
||||||
|
ReceivedInfo,
|
||||||
|
MetaData,
|
||||||
|
)
|
||||||
from unstructured.documents.elements import Element, Text
|
from unstructured.documents.elements import Element, Text
|
||||||
from unstructured.partition.html import partition_html
|
from unstructured.partition.html import partition_html
|
||||||
|
from unstructured.partition.text import split_by_paragraph, partition_text
|
||||||
|
|
||||||
|
|
||||||
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html"]
|
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"]
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_received_data(data: str) -> List[Element]:
    """Turns the value of a ``Received`` header into a list of elements.

    Parameters
    ----------
    data
        The raw text of the ``Received`` header.

    Returns
    -------
    One ``ReceivedInfo`` per (host name, IP address) pair, followed by the first
    MAPI id and the received timestamp when those are found in ``data``.
    """
    host_names = extract_ip_address_name(data)
    addresses = extract_ip_address(data)
    mapi_ids = extract_mapi_id(data)
    received_at = extract_datetimetz(data)

    received: List[Element] = []

    # Names and addresses are paired positionally; zip() stops at the shorter list.
    if host_names and addresses:
        received.extend(
            ReceivedInfo(name=host, text=address) for host, address in zip(host_names, addresses)
        )

    if mapi_ids:
        received.append(ReceivedInfo(name="mapi_id", text=mapi_ids[0]))

    if received_at:
        # NOTE(review): the value appended is whatever set_datestamp() returns;
        # this assumes it returns the element itself - confirm in email_elements.
        stamped = ReceivedInfo(name="received_datetimetz", text=str(received_at)).set_datestamp(
            datestamp=received_at
        )
        received.append(stamped)

    return received
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_email_address(data: str) -> Tuple[str, str]:
    """Splits an address header value into a display name and an email address.

    Parameters
    ----------
    data
        The raw header value, e.g. ``"Jane Doe <jane.doe@example.com>"``.

    Returns
    -------
    A ``(name, address)`` tuple. ``name`` is the text preceding the first
    ``<address>`` token, lower-cased, title-cased and stripped. ``address`` is the
    first address found by ``extract_email_address``, or an empty string when the
    header contains none (e.g. ``undisclosed-recipients:;``).
    """
    addresses = extract_email_address(data)

    # Raw string keeps the regex escapes literal without invalid-escape warnings.
    bracketed_address = r"<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>"
    # The pattern is lower-case only, so the header is lower-cased before splitting.
    name = re.split(bracketed_address, data.lower())[0].title().strip()

    # Headers without any address must not crash header partitioning.
    address = addresses[0] if addresses else ""
    return name, address
|
||||||
|
|
||||||
|
|
||||||
|
def partition_email_header(msg: Message) -> List[Element]:
    """Converts every header of an email message into an element.

    Parameters
    ----------
    msg
        The parsed email message whose raw header items are read.

    Returns
    -------
    A list with a ``Recipient`` for each ``To`` header, a ``Sender`` for each
    ``From`` header, a ``Subject`` for each ``Subject`` header, the elements
    parsed from each ``Received`` header, and a ``MetaData`` element for every
    other header. Header names are compared exactly as written in the message.
    """
    header_elements: List[Element] = []

    for field_name, field_value in msg.raw_items():
        if field_name == "To":
            display_name, address = _parse_email_address(field_value)
            header_elements.append(Recipient(name=display_name, text=address))
        elif field_name == "From":
            display_name, address = _parse_email_address(field_value)
            header_elements.append(Sender(name=display_name, text=address))
        elif field_name == "Subject":
            header_elements.append(Subject(text=field_value))
        elif field_name == "Received":
            header_elements.extend(_parse_received_data(field_value))
        else:
            # Any header without dedicated handling is kept as generic metadata.
            header_elements.append(MetaData(name=field_name, text=field_value))

    return header_elements
|
||||||
|
|
||||||
|
|
||||||
def extract_attachment_info(
|
def extract_attachment_info(
|
||||||
@ -40,7 +107,7 @@ def extract_attachment_info(
|
|||||||
if output_dir:
|
if output_dir:
|
||||||
filename = output_dir + "/" + attachment["filename"]
|
filename = output_dir + "/" + attachment["filename"]
|
||||||
with open(filename, "wb") as f:
|
with open(filename, "wb") as f:
|
||||||
# mypy wants to just us `w` when opening the file but this
|
# Note(harrell) mypy wants to just use `w` when opening the file but this
|
||||||
# causes an error since the payloads are bytes not str
|
# causes an error since the payloads are bytes not str
|
||||||
f.write(attachment["payload"]) # type: ignore
|
f.write(attachment["payload"]) # type: ignore
|
||||||
return list_attachments
|
return list_attachments
|
||||||
@ -51,6 +118,7 @@ def partition_email(
|
|||||||
file: Optional[IO] = None,
|
file: Optional[IO] = None,
|
||||||
text: Optional[str] = None,
|
text: Optional[str] = None,
|
||||||
content_source: str = "text/html",
|
content_source: str = "text/html",
|
||||||
|
include_headers: bool = False,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Partitions an .eml documents into its constituent elements.
|
"""Partitions an .eml documents into its constituent elements.
|
||||||
Parameters
|
Parameters
|
||||||
@ -61,6 +129,9 @@ def partition_email(
|
|||||||
A file-like object using "r" mode --> open(filename, "r").
|
A file-like object using "r" mode --> open(filename, "r").
|
||||||
text
|
text
|
||||||
The string representation of the .eml document.
|
The string representation of the .eml document.
|
||||||
|
content_source
|
||||||
|
default: "text/html"
|
||||||
|
other: "text/plain"
|
||||||
"""
|
"""
|
||||||
if content_source not in VALID_CONTENT_SOURCES:
|
if content_source not in VALID_CONTENT_SOURCES:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -92,7 +163,7 @@ def partition_email(
|
|||||||
|
|
||||||
content = content_map.get(content_source, "")
|
content = content_map.get(content_source, "")
|
||||||
if not content:
|
if not content:
|
||||||
raise ValueError("text/html content not found in email")
|
raise ValueError(f"{content_source} content not found in email")
|
||||||
|
|
||||||
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
|
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
|
||||||
# looks like the following, resulting in extraneous "=" chracters in the output if
|
# looks like the following, resulting in extraneous "=" chracters in the output if
|
||||||
@ -101,11 +172,19 @@ def partition_email(
|
|||||||
# <li>Item 1</li>=
|
# <li>Item 1</li>=
|
||||||
# <li>Item 2<li>=
|
# <li>Item 2<li>=
|
||||||
# </ul>
|
# </ul>
|
||||||
content = "".join(content.split("=\n"))
|
list_content = split_by_paragraph(content)
|
||||||
|
|
||||||
|
if content_source == "text/html":
|
||||||
|
content = "".join(list_content)
|
||||||
elements = partition_html(text=content)
|
elements = partition_html(text=content)
|
||||||
for element in elements:
|
for element in elements:
|
||||||
if isinstance(element, Text):
|
if isinstance(element, Text):
|
||||||
element.apply(replace_mime_encodings)
|
element.apply(replace_mime_encodings)
|
||||||
|
elif content_source == "text/plain":
|
||||||
|
elements = partition_text(text=content)
|
||||||
|
|
||||||
return elements
|
header: List[Element] = list()
|
||||||
|
if include_headers:
|
||||||
|
header = partition_email_header(msg)
|
||||||
|
all_elements = header + elements
|
||||||
|
return all_elements
|
||||||
|
|||||||
66
unstructured/partition/text.py
Normal file
66
unstructured/partition/text.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
import re
|
||||||
|
from typing import IO, List, Optional
|
||||||
|
|
||||||
|
from unstructured.documents.elements import Element, ListItem, NarrativeText, Title
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import clean_bullets
|
||||||
|
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
|
||||||
|
from unstructured.partition.text_type import (
|
||||||
|
is_possible_narrative_text,
|
||||||
|
is_possible_title,
|
||||||
|
is_bulleted_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def split_by_paragraph(content: str) -> List[str]:
    """Splits text into pieces at every match of PARAGRAPH_PATTERN.

    Parameters
    ----------
    content
        The text to split.

    Returns
    -------
    The pieces between separators, in order. Pieces may be empty strings when
    separators are adjacent or sit at either end of ``content``.
    """
    paragraph_separator = re.compile(PARAGRAPH_PATTERN)
    return paragraph_separator.split(content)
|
||||||
|
|
||||||
|
|
||||||
|
def partition_text(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    text: Optional[str] = None,
) -> List[Element]:
    """Partitions a .txt document into its constituent elements.

    Exactly one of ``filename``, ``file`` or ``text`` must be provided.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the .txt document.

    Returns
    -------
    One element per non-empty paragraph that is classified as a bulleted item
    (``ListItem``, with the bullet removed), narrative text (``NarrativeText``)
    or a title (``Title``), checked in that order. Paragraphs matching none of
    these are omitted.

    Raises
    ------
    ValueError
        If none, or more than one, of ``filename``, ``file`` and ``text`` is given.
    """
    if not any([filename, file, text]):
        raise ValueError("One of filename, file, or text must be specified.")

    if filename is not None and not file and not text:
        with open(filename, "r") as f:
            file_text = f.read()

    elif file is not None and not filename and not text:
        file_text = file.read()

    elif text is not None and not filename and not file:
        file_text = str(text)

    else:
        raise ValueError("Only one of filename, file, or text can be specified.")

    file_content = split_by_paragraph(file_text)

    elements: List[Element] = []
    for ctext in file_content:
        ctext = ctext.strip()

        # Adjacent separators (e.g. "\r\n\r\n") yield empty pieces. Skip them
        # and keep going: stopping here would drop the rest of the document.
        if ctext == "":
            continue

        if is_bulleted_text(ctext):
            elements.append(ListItem(text=clean_bullets(ctext)))
        elif is_possible_narrative_text(ctext):
            elements.append(NarrativeText(text=ctext))
        elif is_possible_title(ctext):
            elements.append(Title(text=ctext))

    return elements
|
||||||
Loading…
x
Reference in New Issue
Block a user