diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f1b82b09..2f9820905 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.3.3-dev1 + +* Adds the `partition_email` partitioning brick +* Adds the `replace_mime_encodings` cleaning brick +* Small fix to HTML parsing related to processing list items with sub-tags + ## 0.3.2 * Added `translate_text` brick for translating text between languages diff --git a/README.md b/README.md index 6fee4a2f4..2e705e3f9 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@
- +

Open-Source Pre-Processing Tools for Unstructured Data

@@ -148,6 +148,48 @@ has an `element` attribute consisting of `Element` objects. Sub-types of the `El represent different components of a document, such as `NarrativeText` and `Title`. You can use these normalized elements to zero in on the components of a document you most care about. +### E-mail Parsing + +The `partition_email` function within `unstructured` is helpful for parsing `.eml` files. Common +e-mail clients such as Microsoft Outlook and Gmail support exporting e-mails as `.eml` files. +`partition_email` accepts filenames, file-like objects, and raw text as input. The following +three snippets for parsing `.eml` files are equivalent: + +```python +from unstructured.partition.email import partition_email + +elements = partition_email(filename="example-docs/fake-email.eml") + +with open("example-docs/fake-email.eml", "r") as f: + elements = partition_email(file=f) + +with open("example-docs/fake-email.eml", "r") as f: + text = f.read() +elements = partition_email(text=text) +``` + +The `elements` output will look like the following: + +```python +[, +, +, +] +``` + +Run `print("\n\n".join([str(el) for el in elements]))` to get a string representation of the +output, which looks like: + +```python +This is a test email to use for unit tests. + +Important points: + +Roses are red + +Violets are blue +``` + ## :guardsman: Security Policy See our [security policy](https://github.com/Unstructured-IO/unstructured/security/policy) for diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index da025fc74..ddf64d76f 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -54,6 +54,30 @@ Examples: elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf") +``partition_email`` +--------------------- + +The ``partition_email`` function partitions ``.eml`` documents and works with exports +from email clients such as Microsoft Outlook and Gmail. 
The ``partition_email`` function +takes a filename, file-like object, or raw text as input and produces a list of +document ``Element`` objects as output. + +Examples: + +.. code:: python + + from unstructured.partition.email import partition_email + + elements = partition_email(filename="example-docs/fake-email.eml") + + with open("example-docs/fake-email.eml", "r") as f: + elements = partition_email(file=f) + + with open("example-docs/fake-email.eml", "r") as f: + text = f.read() + elements = partition_email(text=text) + + ``is_bulleted_text`` ---------------------- diff --git a/example-docs/fake-email.eml b/example-docs/fake-email.eml new file mode 100644 index 000000000..702a40852 --- /dev/null +++ b/example-docs/fake-email.eml @@ -0,0 +1,24 @@ +MIME-Version: 1.0 +Date: Fri, 16 Dec 2022 17:04:16 -0500 +Message-ID: +Subject: Test Email +From: Matthew Robinson +To: Matthew Robinson +Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" + +--00000000000095c9b205eff92630 +Content-Type: text/plain; charset="UTF-8" + +This is a test email to use for unit tests. + +Important points: + + - Roses are red + - Violets are blue + +--00000000000095c9b205eff92630 +Content-Type: text/html; charset="UTF-8" + +
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
+ +--00000000000095c9b205eff92630-- \ No newline at end of file diff --git a/test_unstructured/cleaners/test_core.py b/test_unstructured/cleaners/test_core.py index d55d7c4e5..9d1906351 100644 --- a/test_unstructured/cleaners/test_core.py +++ b/test_unstructured/cleaners/test_core.py @@ -29,6 +29,14 @@ def test_replace_unicode_quotes(text, expected): assert core.replace_unicode_quotes(text=text) == expected +@pytest.mark.parametrize( + "text, expected", + [("5 w=E2=80=99s", "5 w’s")], +) +def test_replace_mime_encodings(text, expected): + assert core.replace_mime_encodings(text=text) == expected + + @pytest.mark.parametrize( "text, expected", [ diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py new file mode 100644 index 000000000..939c39a19 --- /dev/null +++ b/test_unstructured/partition/test_email.py @@ -0,0 +1,61 @@ +import os +import pathlib +import pytest + +from unstructured.documents.elements import NarrativeText, Title, ListItem +from unstructured.partition.email import partition_email + + +DIRECTORY = pathlib.Path(__file__).parent.resolve() + + +EXPECTED_OUTPUT = [ + NarrativeText(text="This is a test email to use for unit tests."), + Title(text="Important points:"), + ListItem(text="Roses are red"), + ListItem(text="Violets are blue"), +] + + +def test_partition_email_from_filename(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml") + elements = partition_email(filename=filename) + assert len(elements) > 0 + assert elements == EXPECTED_OUTPUT + + +def test_partition_email_from_file(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml") + with open(filename, "r") as f: + elements = partition_email(file=f) + assert len(elements) > 0 + assert elements == EXPECTED_OUTPUT + + +def test_partition_email_from_text(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml") + with open(filename, "r") as f: + text = f.read() + 
elements = partition_email(text=text) + assert len(elements) > 0 + assert elements == EXPECTED_OUTPUT + + +def test_partition_email_raises_with_none_specified(): + with pytest.raises(ValueError): + partition_email() + + +def test_partition_email_raises_with_too_many_specified(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml") + with open(filename, "r") as f: + text = f.read() + + with pytest.raises(ValueError): + partition_email(filename=filename, text=text) + + +def test_partition_email_raises_with_invalid_content_type(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml") + with pytest.raises(ValueError): + partition_email(filename=filename, content_source="application/json") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9ed9c2a0e..6b38406da 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.3.2" # pragma: no cover +__version__ = "0.3.3-dev1" # pragma: no cover diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index f5da2a695..2a2ca58b4 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -1,6 +1,8 @@ import re import sys import unicodedata +import quopri + from unstructured.nlp.patterns import UNICODE_BULLETS_RE @@ -81,6 +83,16 @@ def clean_trailing_punctuation(text: str) -> str: return text.strip().rstrip(".,:;") +def replace_mime_encodings(text: str) -> str: + """Replaces MIME encodings with their UTF-8 equivalent characters. + + Example + ------- + 5 w=E2=80=99s -> 5 w’s + """ + return quopri.decodestring(text.encode()).decode("utf-8") + + def clean_prefix(text: str, pattern: str, ignore_case: bool = False, strip: bool = True) -> str: """Removes prefixes from a string according to the specified pattern. Strips leading whitespace if the strip parameter is set to True. 
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index a8f5091c3..d37903582 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -225,8 +225,13 @@ def _construct_text(tag_elem: etree.Element) -> str: return text.strip() -def _is_text_tag(tag_elem: etree.Element) -> bool: +def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool: """Deteremines if a tag potentially contains narrative text.""" + # NOTE(robinson) - Only consider elements with limited depth. Otherwise, + # it could be the text representation of a giant div + if len(tag_elem) > max_predecessor_len: + return False + if tag_elem.tag in TEXT_TAGS + HEADING_TAGS: return True @@ -250,7 +255,7 @@ def _process_list_item( we can skip processing if bullets are found in a div element.""" if tag_elem.tag in LIST_ITEM_TAGS: text = _construct_text(tag_elem) - return HTMLListItem(text=text, tag=tag_elem.tag), None + return HTMLListItem(text=text, tag=tag_elem.tag), tag_elem elif tag_elem.tag == "div": text = _construct_text(tag_elem) diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py new file mode 100644 index 000000000..33b98bafc --- /dev/null +++ b/unstructured/partition/email.py @@ -0,0 +1,74 @@ +import email +from typing import Dict, Final, IO, List, Optional + +from unstructured.cleaners.core import replace_mime_encodings +from unstructured.documents.elements import Element, Text +from unstructured.partition.html import partition_html + + +VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html"] + + +def partition_email( + filename: Optional[str] = None, + file: Optional[IO] = None, + text: Optional[str] = None, + content_source: str = "text/html", +) -> List[Element]: + """Partitions an .eml document into its constituent elements. + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object using "r" mode --> open(filename, "r"). 
+ text + The string representation of the .eml document. + """ + if content_source not in VALID_CONTENT_SOURCES: + raise ValueError( + f"{content_source} is not a valid value for content_source. " + f"Valid content sources are: {VALID_CONTENT_SOURCES}" + ) + + if not any([filename, file, text]): + raise ValueError("One of filename, file, or text must be specified.") + + if filename is not None and not file and not text: + with open(filename, "r") as f: + msg = email.message_from_file(f) + + elif file is not None and not filename and not text: + file_text = file.read() + msg = email.message_from_string(file_text) + + elif text is not None and not filename and not file: + _text: str = str(text) + msg = email.message_from_string(_text) + + else: + raise ValueError("Only one of filename, file, or text can be specified.") + + content_map: Dict[str, str] = { + part.get_content_type(): part.get_payload() for part in msg.walk() + } + + content = content_map.get(content_source, "") + if not content: + raise ValueError("text/html content not found in email") + + # NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that + # looks like the following, resulting in extraneous "=" characters in the output if + # you don't clean it up + #
    = + #
  • Item 1
  • = + #
  • Item 2
  • = + #
+ content = "".join(content.split("=\n")) + + elements = partition_html(text=content) + for element in elements: + if isinstance(element, Text): + element.apply(replace_mime_encodings) + + return elements