feat: Add Image element and find_embedded_image function (#130)

* add find_embedded_image
2025-12-29 16:17:00 +00:00 · 2023-01-09 19:49:19 -06:00 · 2023-01-09 19:49:19 -06:00 · e0feba83f6
commit e0feba83f6
parent 7b3b594ee5
8 changed files with 7754 additions and 19 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -13,6 +13,7 @@
 * Added new functions to extract header information `parse_received_data` and `partition_header`
 * Added new function to parse plain text files `partition_text`
 * Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
+* Added new `Image` element and new function to find embedded images `find_embedded_image`

 ## 0.3.5

--- a/example-docs/email-with-image.eml
+++ b/example-docs/email-with-image.eml
--- a/example-docs/fake-email-image-embedded.eml
+++ b/example-docs/fake-email-image-embedded.eml
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@ -4,6 +4,10 @@
 #
 #    pip-compile requirements/dev.in
 #
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
    # via notebook
 argon2-cffi-bindings==21.2.0
@ -36,6 +40,10 @@ executing==1.0.0
    # via stack-data
 fastjsonschema==2.16.2
    # via nbformat
+importlib-metadata==6.0.0
+    # via nbconvert
+importlib-resources==5.10.2
+    # via jsonschema
 ipykernel==6.15.3
    # via
    #   ipywidgets
@ -45,7 +53,7 @@ ipykernel==6.15.3
    #   qtconsole
 ipython==8.6.0
    # via
-    #   -r dev.in
+    #   -r requirements/dev.in
    #   ipykernel
    #   ipywidgets
    #   jupyter-console
@ -64,7 +72,7 @@ jinja2==3.1.2
 jsonschema==4.16.0
    # via nbformat
 jupyter==1.0.0
-    # via -r dev.in
+    # via -r requirements/dev.in
 jupyter-client==7.3.5
    # via
    #   ipykernel
@ -76,7 +84,7 @@ jupyter-console==6.4.4
    # via jupyter
 jupyter-core==5.1.3
    # via
-    #   -r dev.in
+    #   -r requirements/dev.in
    #   jupyter-client
    #   nbconvert
    #   nbformat
@ -134,7 +142,9 @@ pexpect==4.8.0
 pickleshare==0.7.5
    # via ipython
 pip-tools==6.12.1
-    # via -r dev.in
+    # via -r requirements/dev.in
+pkgutil-resolve-name==1.3.10
+    # via jsonschema
 platformdirs==2.5.4
    # via jupyter-core
 prometheus-client==0.14.1
@ -190,6 +200,10 @@ terminado==0.15.0
    # via notebook
 tinycss2==1.1.1
    # via nbconvert
+tomli==2.0.1
+    # via
+    #   build
+    #   pep517
 tornado==6.2
    # via
    #   ipykernel
@ -217,10 +231,14 @@ webencodings==0.5.1
    #   tinycss2
 wheel==0.38.4
    # via
-    #   -r dev.in
+    #   -r requirements/dev.in
    #   pip-tools
 widgetsnbextension==4.0.3
    # via ipywidgets
+zipp==3.11.0
+    # via
+    #   importlib-metadata
+    #   importlib-resources

 # The following packages are considered to be unsafe in a requirements file:
 # pip
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -7,23 +7,23 @@
 attrs==22.1.0
    # via pytest
 black==22.12.0
-    # via -r test.in
+    # via -r requirements/test.in
 certifi==2022.12.7
    # via
-    #   -r test.in
+    #   -r requirements/test.in
    #   requests
 charset-normalizer==2.1.1
    # via requests
 click==8.1.3
    # via
-    #   -r test.in
+    #   -r requirements/test.in
    #   black
 coverage[toml]==6.4.4
    # via
-    #   -r test.in
+    #   -r requirements/test.in
    #   pytest-cov
 flake8==5.0.4
-    # via -r test.in
+    # via -r requirements/test.in
 idna==3.4
    # via
    #   requests
@ -31,7 +31,7 @@ idna==3.4
 iniconfig==1.1.1
    # via pytest
 label-studio-sdk==0.0.15
-    # via -r test.in
+    # via -r requirements/test.in
 lxml==4.9.1
    # via label-studio-sdk
 mccabe==0.7.0
@ -39,7 +39,7 @@ mccabe==0.7.0
 multidict==6.0.2
    # via yarl
 mypy==0.991
-    # via -r test.in
+    # via -r requirements/test.in
 mypy-extensions==0.4.3
    # via
    #   black
@ -65,7 +65,7 @@ pyparsing==3.0.9
 pytest==7.1.3
    # via pytest-cov
 pytest-cov==4.0.0
-    # via -r test.in
+    # via -r requirements/test.in
 pyyaml==6.0
    # via vcrpy
 requests==2.28.1
@ -73,15 +73,20 @@ requests==2.28.1
 six==1.16.0
    # via vcrpy
 tomli==2.0.1
-    # via pytest
+    # via
+    #   black
+    #   coverage
+    #   mypy
+    #   pytest
 typing-extensions==4.3.0
    # via
+    #   black
    #   mypy
    #   pydantic
 urllib3==1.26.12
    # via requests
 vcrpy==4.2.1
-    # via -r test.in
+    # via -r requirements/test.in
 wrapt==1.14.1
    # via vcrpy
 yarl==1.8.1
--- a/test_unstructured/partition/test_email.py
+++ b/test_unstructured/partition/test_email.py
@ -3,7 +3,7 @@ import os
 import pathlib
 import pytest

-from unstructured.documents.elements import NarrativeText, Title, ListItem
+from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
 from unstructured.documents.email_elements import (
    MetaData,
    Recipient,
@ -27,6 +27,15 @@ EXPECTED_OUTPUT = [
    ListItem(text="Violets are blue"),
 ]

+# Elements expected from partitioning example-docs/fake-email-image-embedded.eml with
+# content_source="text/plain"; the embedded image is reported as its own Image element
+# placed directly after the text element that contained it.
+IMAGE_EXPECTED_OUTPUT = [
+    NarrativeText(text="This is a test email to use for unit tests."),
+    Title(text="Important points:"),
+    NarrativeText(text="hello this is our logo."),
+    Image(text="unstructured_logo.png"),
+    ListItem(text="Roses are red"),
+    ListItem(text="Violets are blue"),
+]
+
 HEADER_EXPECTED_OUTPUT = [
    MetaData(name="MIME-Version", text="1.0"),
    MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
@ -97,6 +106,13 @@ def test_partition_email_from_text():
    assert elements == EXPECTED_OUTPUT


+def test_partition_email_from_filename_with_embedded_image():
+    """The plain-text part of an email with an inline image yields an Image element."""
+    eml_path = os.path.join(
+        DIRECTORY,
+        "..",
+        "..",
+        "example-docs",
+        "fake-email-image-embedded.eml",
+    )
+    partitioned = partition_email(filename=eml_path, content_source="text/plain")
+    assert len(partitioned) > 0
+    assert partitioned == IMAGE_EXPECTED_OUTPUT
+
+
 def test_partition_email_header():
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
    with open(filename, "r") as f:
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -72,3 +72,11 @@ class Title(Text):
    category = "Title"

    pass
+
+
+class Image(Text):
+    """Text element that records metadata about an image instead of document prose."""
+
+    category = "Image"
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@ -2,7 +2,7 @@ import email
 import sys
 import re
 from email.message import Message
-from typing import Dict, IO, List, Optional, Tuple
+from typing import Dict, IO, List, Optional, Tuple, Union

 if sys.version_info < (3, 8):
    from typing_extensions import Final
@ -24,7 +24,7 @@ from unstructured.documents.email_elements import (
    ReceivedInfo,
    MetaData,
 )
-from unstructured.documents.elements import Element, Text
+from unstructured.documents.elements import Element, Text, Image, NarrativeText, Title
 from unstructured.partition.html import partition_html
 from unstructured.partition.text import split_by_paragraph, partition_text

@ -113,6 +113,25 @@ def extract_attachment_info(
    return list_attachments


+def has_embedded_image(element) -> Optional[re.Match]:
+    """Search an element's text for an embedded image marker.
+
+    An embedded image marker has the form ``[image: <name>]``.
+
+    ``element`` may be any object exposing a ``text`` string attribute.
+
+    Returns the ``re.Match`` for the first marker found, or ``None`` when the text
+    contains no marker, so the result can be used directly as a truth value.
+    """
+    # A raw string keeps ``\[`` and ``\]`` as regex escapes instead of invalid string
+    # escapes. The lazy ``.+?`` stops at the first closing bracket, so two markers on
+    # the same line are not merged into a single match.
+    pattern = re.compile(r"\[image: .+?\]")
+    return pattern.search(element.text)
+
+
+def find_embedded_image(
+    element: Union[NarrativeText, Title], indices: re.Match
+) -> Tuple[Element, Element]:
+    """Split an embedded image marker out of a text element.
+
+    ``element`` is the text element containing the marker; its ``text`` is modified in
+    place. ``indices`` is the match returned by ``has_embedded_image`` for that element,
+    whose span must cover one complete ``[image: <name>]`` marker.
+
+    Returns a tuple ``(image, element)``: a new ``Image`` element whose text is the image
+    name, and the same ``element`` object with the marker removed from its text.
+    """
+    # The matched span is the marker exactly as it appears in the text.
+    marker = element.text[indices.start() : indices.end()]
+
+    # Strip the "[image:" prefix and the closing "]" by position rather than splitting
+    # on ":", so names that themselves contain a colon are kept whole.
+    image_name = clean_extra_whitespace(marker[len("[image:") : -1])
+
+    # Remove the marker verbatim; rebuilding it from the cleaned name would fail to
+    # match whenever the whitespace inside the original marker was normalised.
+    element.text = element.text.replace(marker, "")
+
+    return Image(text=image_name), element
+
+
 def partition_email(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
@ -171,7 +190,7 @@ def partition_email(
        raise ValueError(f"{content_source} content not found in email")

    # NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
-    # looks like the following, resulting in extraneous "=" chracters in the output if
+    # looks like the following, resulting in extraneous "=" characters in the output if
    # you don't clean it up
    # <ul> =
    #    <li>Item 1</li>=
@ -188,6 +207,13 @@ def partition_email(
    elif content_source == "text/plain":
        elements = partition_text(text=content)

+    for idx, element in enumerate(elements):
+        indices = has_embedded_image(element)
+        if (isinstance(element, NarrativeText) or isinstance(element, Title)) and indices:
+            image_info, clean_element = find_embedded_image(element, indices)
+            elements[idx] = clean_element
+            elements.insert(idx + 1, image_info)
+
    header: List[Element] = list()
    if include_headers:
        header = partition_email_header(msg)