diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f1b82b09..2f9820905 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.3.3-dev1
+
+* Adds the `partition_email` partitioning brick
+* Adds the `replace_mime_encodings` cleaning brick
+* Small fix to HTML parsing related to processing list items with sub-tags
+
## 0.3.2
* Added `translate_text` brick for translating text between languages
diff --git a/README.md b/README.md
index 6fee4a2f4..2e705e3f9 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
-
+
Open-Source Pre-Processing Tools for Unstructured Data
@@ -148,6 +148,48 @@ has an `element` attribute consisting of `Element` objects. Sub-types of the `El
represent different components of a document, such as `NarrativeText` and `Title`. You can use
these normalized elements to zero in on the components of a document you most care about.
+### E-mail Parsing
+
+The `partition_email` function within `unstructured` is helpful for parsing `.eml` files. Common
+e-mail clients such as Microsoft Outlook and Gmail support exporting e-mails as `.eml` files.
+`partition_email` accepts filenames, file-like objects, and raw text as input. The following
+three snippets for parsing `.eml` files are equivalent:
+
+```python
+from unstructured.partition.email import partition_email
+
+elements = partition_email(filename="example-docs/fake-email.eml")
+
+with open("example-docs/fake-email.eml", "r") as f:
+ elements = partition_email(file=f)
+
+with open("example-docs/fake-email.eml", "r") as f:
+ text = f.read()
+elements = partition_email(text=text)
+```
+
+The `elements` output will look like the following:
+
+```python
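+[<unstructured.documents.html.HTMLNarrativeText at 0x13ab14370>,
+<unstructured.documents.html.HTMLTitle at 0x106877970>,
+<unstructured.documents.html.HTMLListItem at 0x1068776a0>,
+<unstructured.documents.html.HTMLListItem at 0x13fe4b0a0>]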
+```
+
+Run `print("\n\n".join([str(el) for el in elements]))` to get a string representation of the
+output, which looks like:
+
+```python
+This is a test email to use for unit tests.
+
+Important points:
+
+Roses are red
+
+Violets are blue
+```
+
## :guardsman: Security Policy
See our [security policy](https://github.com/Unstructured-IO/unstructured/security/policy) for
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index da025fc74..ddf64d76f 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -54,6 +54,30 @@ Examples:
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
+``partition_email``
+---------------------
+
+The ``partition_email`` function partitions ``.eml`` documents and works with exports
+from email clients such as Microsoft Outlook and Gmail. ``partition_email`` takes a
+filename, file-like object, or raw text as input and produces a list of
+document ``Element`` objects as output.
+
+Examples:
+
+.. code:: python
+
+ from unstructured.partition.email import partition_email
+
+ elements = partition_email(filename="example-docs/fake-email.eml")
+
+ with open("example-docs/fake-email.eml", "r") as f:
+ elements = partition_email(file=f)
+
+ with open("example-docs/fake-email.eml", "r") as f:
+ text = f.read()
+ elements = partition_email(text=text)
+
+
``is_bulleted_text``
----------------------
diff --git a/example-docs/fake-email.eml b/example-docs/fake-email.eml
new file mode 100644
index 000000000..702a40852
--- /dev/null
+++ b/example-docs/fake-email.eml
@@ -0,0 +1,24 @@
+MIME-Version: 1.0
+Date: Fri, 16 Dec 2022 17:04:16 -0500
+Message-ID:
+Subject: Test Email
+From: Matthew Robinson
+To: Matthew Robinson
+Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
+
+--00000000000095c9b205eff92630
+Content-Type: text/plain; charset="UTF-8"
+
+This is a test email to use for unit tests.
+
+Important points:
+
+ - Roses are red
+ - Violets are blue
+
+--00000000000095c9b205eff92630
+Content-Type: text/html; charset="UTF-8"
+
+This is a test email to use for unit tests.
Important points:
- Roses are red
- Violets are blue
+
+--00000000000095c9b205eff92630--
\ No newline at end of file
diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py
index d55d7c4e5..9d1906351 100644
--- a/test_unstructured/cleaners/test_core.py
+++ b/test_unstructured/cleaners/test_core.py
@@ -29,6 +29,14 @@ def test_replace_unicode_quotes(text, expected):
assert core.replace_unicode_quotes(text=text) == expected
+@pytest.mark.parametrize(
+ "text, expected",
+ [("5 w=E2=80=99s", "5 w’s")],
+)
+def test_replace_mime_encodings(text, expected):
+ assert core.replace_mime_encodings(text=text) == expected
+
+
@pytest.mark.parametrize(
"text, expected",
[
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
new file mode 100644
index 000000000..939c39a19
--- /dev/null
+++ b/test_unstructured/partition/test_email.py
@@ -0,0 +1,61 @@
+import os
+import pathlib
+import pytest
+
+from unstructured.documents.elements import NarrativeText, Title, ListItem
+from unstructured.partition.email import partition_email
+
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+
+EXPECTED_OUTPUT = [
+ NarrativeText(text="This is a test email to use for unit tests."),
+ Title(text="Important points:"),
+ ListItem(text="Roses are red"),
+ ListItem(text="Violets are blue"),
+]
+
+
+def test_partition_email_from_filename():
+ filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
+ elements = partition_email(filename=filename)
+ assert len(elements) > 0
+ assert elements == EXPECTED_OUTPUT
+
+
+def test_partition_email_from_file():
+ filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
+ with open(filename, "r") as f:
+ elements = partition_email(file=f)
+ assert len(elements) > 0
+ assert elements == EXPECTED_OUTPUT
+
+
+def test_partition_email_from_text():
+ filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
+ with open(filename, "r") as f:
+ text = f.read()
+ elements = partition_email(text=text)
+ assert len(elements) > 0
+ assert elements == EXPECTED_OUTPUT
+
+
+def test_partition_email_raises_with_none_specified():
+ with pytest.raises(ValueError):
+ partition_email()
+
+
+def test_partition_email_raises_with_too_many_specified():
+ filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
+ with open(filename, "r") as f:
+ text = f.read()
+
+ with pytest.raises(ValueError):
+ partition_email(filename=filename, text=text)
+
+
+def test_partition_email_raises_with_invalid_content_type():
+ filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
+ with pytest.raises(ValueError):
+ partition_email(filename=filename, content_source="application/json")
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 9ed9c2a0e..6b38406da 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.2" # pragma: no cover
+__version__ = "0.3.3-dev1" # pragma: no cover
diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index f5da2a695..2a2ca58b4 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -1,6 +1,8 @@
import re
import sys
import unicodedata
+import quopri
+
from unstructured.nlp.patterns import UNICODE_BULLETS_RE
@@ -81,6 +83,16 @@ def clean_trailing_punctuation(text: str) -> str:
return text.strip().rstrip(".,:;")
+def replace_mime_encodings(text: str) -> str:
+ """Replaces MIME encodings with their UTF-8 equivalent characters.
+
+ Example
+ -------
+ 5 w=E2=80=99s -> 5 w’s
+ """
+ return quopri.decodestring(text.encode()).decode("utf-8")
+
+
def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str:
"""Removes prefixes from a string according to the specified pattern. Strips leading
whitespace if the strip parameter is set to True.
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
index a8f5091c3..d37903582 100644
--- a/unstructured/documents/html.py
+++ b/unstructured/documents/html.py
@@ -225,8 +225,13 @@ def _construct_text(tag_elem: etree.Element) -> str:
return text.strip()
-def _is_text_tag(tag_elem: etree.Element) -> bool:
+def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool:
"""Deteremines if a tag potentially contains narrative text."""
+ # NOTE(robinson) - Only consider elements with a limited number of child elements.
+ # Otherwise, it could be the text representation of a giant div
+ if len(tag_elem) > max_predecessor_len:
+ return False
+
if tag_elem.tag in TEXT_TAGS + HEADING_TAGS:
return True
@@ -250,7 +255,7 @@ def _process_list_item(
we can skip processing if bullets are found in a div element."""
if tag_elem.tag in LIST_ITEM_TAGS:
text = _construct_text(tag_elem)
- return HTMLListItem(text=text, tag=tag_elem.tag), None
+ return HTMLListItem(text=text, tag=tag_elem.tag), tag_elem
elif tag_elem.tag == "div":
text = _construct_text(tag_elem)
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
new file mode 100644
index 000000000..33b98bafc
--- /dev/null
+++ b/unstructured/partition/email.py
@@ -0,0 +1,74 @@
+import email
+from typing import Dict, Final, IO, List, Optional
+
+from unstructured.cleaners.core import replace_mime_encodings
+from unstructured.documents.elements import Element, Text
+from unstructured.partition.html import partition_html
+
+
+VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html"]
+
+
+def partition_email(
+ filename: Optional[str] = None,
+ file: Optional[IO] = None,
+ text: Optional[str] = None,
+ content_source: str = "text/html",
+) -> List[Element]:
+ """Partitions an .eml document into its constituent elements.
+ Parameters
+ ----------
+ filename
+ A string defining the target filename path.
+ file
+ A file-like object using "r" mode --> open(filename, "r").
+ text
+ The string representation of the .eml document.
+ """
+ if content_source not in VALID_CONTENT_SOURCES:
+ raise ValueError(
+ f"{content_source} is not a valid value for content_source. "
+ f"Valid content sources are: {VALID_CONTENT_SOURCES}"
+ )
+
+ if not any([filename, file, text]):
+ raise ValueError("One of filename, file, or text must be specified.")
+
+ if filename is not None and not file and not text:
+ with open(filename, "r") as f:
+ msg = email.message_from_file(f)
+
+ elif file is not None and not filename and not text:
+ file_text = file.read()
+ msg = email.message_from_string(file_text)
+
+ elif text is not None and not filename and not file:
+ _text: str = str(text)
+ msg = email.message_from_string(_text)
+
+ else:
+ raise ValueError("Only one of filename, file, or text can be specified.")
+
+ content_map: Dict[str, str] = {
+ part.get_content_type(): part.get_payload() for part in msg.walk()
+ }
+
+ content = content_map.get(content_source, "")
+ if not content:
+ raise ValueError("text/html content not found in email")
+
+ # NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
+ # looks like the following, resulting in extraneous "=" characters in the output if
+ # you don't clean it up
+ # <ul> =
+ #   <li>Item 1</li>=
+ #   <li>Item 2</li>=
+ # </ul>
+ content = "".join(content.split("=\n"))
+
+ elements = partition_html(text=content)
+ for element in elements:
+ if isinstance(element, Text):
+ element.apply(replace_mime_encodings)
+
+ return elements