mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 15:45:21 +00:00
feat: Add Image element and find_embedded_image function (#130)
* add find_embedded_image
This commit is contained in:
parent
7b3b594ee5
commit
e0feba83f6
@ -13,6 +13,7 @@
|
||||
* Added new functions to extract header information `parse_received_data` and `partition_header`
|
||||
* Added new function to parse plain text files `partition_text`
|
||||
* Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
|
||||
* Added new `Image` element and function to find embedded images `find_embedded_image`
|
||||
|
||||
## 0.3.5
|
||||
|
||||
|
||||
3828
example-docs/email-with-image.eml
Normal file
3828
example-docs/email-with-image.eml
Normal file
File diff suppressed because it is too large
Load Diff
3833
example-docs/fake-email-image-embedded.eml
Normal file
3833
example-docs/fake-email-image-embedded.eml
Normal file
File diff suppressed because it is too large
Load Diff
@ -4,6 +4,10 @@
|
||||
#
|
||||
# pip-compile requirements/dev.in
|
||||
#
|
||||
appnope==0.1.3
|
||||
# via
|
||||
# ipykernel
|
||||
# ipython
|
||||
argon2-cffi==21.3.0
|
||||
# via notebook
|
||||
argon2-cffi-bindings==21.2.0
|
||||
@ -36,6 +40,10 @@ executing==1.0.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.16.2
|
||||
# via nbformat
|
||||
importlib-metadata==6.0.0
|
||||
# via nbconvert
|
||||
importlib-resources==5.10.2
|
||||
# via jsonschema
|
||||
ipykernel==6.15.3
|
||||
# via
|
||||
# ipywidgets
|
||||
@ -45,7 +53,7 @@ ipykernel==6.15.3
|
||||
# qtconsole
|
||||
ipython==8.6.0
|
||||
# via
|
||||
# -r dev.in
|
||||
# -r requirements/dev.in
|
||||
# ipykernel
|
||||
# ipywidgets
|
||||
# jupyter-console
|
||||
@ -64,7 +72,7 @@ jinja2==3.1.2
|
||||
jsonschema==4.16.0
|
||||
# via nbformat
|
||||
jupyter==1.0.0
|
||||
# via -r dev.in
|
||||
# via -r requirements/dev.in
|
||||
jupyter-client==7.3.5
|
||||
# via
|
||||
# ipykernel
|
||||
@ -76,7 +84,7 @@ jupyter-console==6.4.4
|
||||
# via jupyter
|
||||
jupyter-core==5.1.3
|
||||
# via
|
||||
# -r dev.in
|
||||
# -r requirements/dev.in
|
||||
# jupyter-client
|
||||
# nbconvert
|
||||
# nbformat
|
||||
@ -134,7 +142,9 @@ pexpect==4.8.0
|
||||
pickleshare==0.7.5
|
||||
# via ipython
|
||||
pip-tools==6.12.1
|
||||
# via -r dev.in
|
||||
# via -r requirements/dev.in
|
||||
pkgutil-resolve-name==1.3.10
|
||||
# via jsonschema
|
||||
platformdirs==2.5.4
|
||||
# via jupyter-core
|
||||
prometheus-client==0.14.1
|
||||
@ -190,6 +200,10 @@ terminado==0.15.0
|
||||
# via notebook
|
||||
tinycss2==1.1.1
|
||||
# via nbconvert
|
||||
tomli==2.0.1
|
||||
# via
|
||||
# build
|
||||
# pep517
|
||||
tornado==6.2
|
||||
# via
|
||||
# ipykernel
|
||||
@ -217,10 +231,14 @@ webencodings==0.5.1
|
||||
# tinycss2
|
||||
wheel==0.38.4
|
||||
# via
|
||||
# -r dev.in
|
||||
# -r requirements/dev.in
|
||||
# pip-tools
|
||||
widgetsnbextension==4.0.3
|
||||
# via ipywidgets
|
||||
zipp==3.11.0
|
||||
# via
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# pip
|
||||
|
||||
@ -7,23 +7,23 @@
|
||||
attrs==22.1.0
|
||||
# via pytest
|
||||
black==22.12.0
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
certifi==2022.12.7
|
||||
# via
|
||||
# -r test.in
|
||||
# -r requirements/test.in
|
||||
# requests
|
||||
charset-normalizer==2.1.1
|
||||
# via requests
|
||||
click==8.1.3
|
||||
# via
|
||||
# -r test.in
|
||||
# -r requirements/test.in
|
||||
# black
|
||||
coverage[toml]==6.4.4
|
||||
# via
|
||||
# -r test.in
|
||||
# -r requirements/test.in
|
||||
# pytest-cov
|
||||
flake8==5.0.4
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
idna==3.4
|
||||
# via
|
||||
# requests
|
||||
@ -31,7 +31,7 @@ idna==3.4
|
||||
iniconfig==1.1.1
|
||||
# via pytest
|
||||
label-studio-sdk==0.0.15
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
lxml==4.9.1
|
||||
# via label-studio-sdk
|
||||
mccabe==0.7.0
|
||||
@ -39,7 +39,7 @@ mccabe==0.7.0
|
||||
multidict==6.0.2
|
||||
# via yarl
|
||||
mypy==0.991
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
mypy-extensions==0.4.3
|
||||
# via
|
||||
# black
|
||||
@ -65,7 +65,7 @@ pyparsing==3.0.9
|
||||
pytest==7.1.3
|
||||
# via pytest-cov
|
||||
pytest-cov==4.0.0
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
pyyaml==6.0
|
||||
# via vcrpy
|
||||
requests==2.28.1
|
||||
@ -73,15 +73,20 @@ requests==2.28.1
|
||||
six==1.16.0
|
||||
# via vcrpy
|
||||
tomli==2.0.1
|
||||
# via pytest
|
||||
# via
|
||||
# black
|
||||
# coverage
|
||||
# mypy
|
||||
# pytest
|
||||
typing-extensions==4.3.0
|
||||
# via
|
||||
# black
|
||||
# mypy
|
||||
# pydantic
|
||||
urllib3==1.26.12
|
||||
# via requests
|
||||
vcrpy==4.2.1
|
||||
# via -r test.in
|
||||
# via -r requirements/test.in
|
||||
wrapt==1.14.1
|
||||
# via vcrpy
|
||||
yarl==1.8.1
|
||||
|
||||
@ -3,7 +3,7 @@ import os
|
||||
import pathlib
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import NarrativeText, Title, ListItem
|
||||
from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
|
||||
from unstructured.documents.email_elements import (
|
||||
MetaData,
|
||||
Recipient,
|
||||
@ -27,6 +27,15 @@ EXPECTED_OUTPUT = [
|
||||
ListItem(text="Violets are blue"),
|
||||
]
|
||||
|
||||
IMAGE_EXPECTED_OUTPUT = [
|
||||
NarrativeText(text="This is a test email to use for unit tests."),
|
||||
Title(text="Important points:"),
|
||||
NarrativeText(text="hello this is our logo."),
|
||||
Image(text="unstructured_logo.png"),
|
||||
ListItem(text="Roses are red"),
|
||||
ListItem(text="Violets are blue"),
|
||||
]
|
||||
|
||||
HEADER_EXPECTED_OUTPUT = [
|
||||
MetaData(name="MIME-Version", text="1.0"),
|
||||
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
|
||||
@ -97,6 +106,13 @@ def test_partition_email_from_text():
|
||||
assert elements == EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_from_filename_with_embedded_image():
    """Partition the text/plain part of the embedded-image example email.

    The result must be non-empty and equal to IMAGE_EXPECTED_OUTPUT.
    """
    eml_path = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "example-docs",
        "fake-email-image-embedded.eml",
    )
    partitioned = partition_email(filename=eml_path, content_source="text/plain")

    assert len(partitioned) > 0
    assert partitioned == IMAGE_EXPECTED_OUTPUT
|
||||
|
||||
|
||||
def test_partition_email_header():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
|
||||
with open(filename, "r") as f:
|
||||
|
||||
@ -72,3 +72,11 @@ class Title(Text):
|
||||
category = "Title"
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class Image(Text):
    """Text element that records metadata about an image (for example its file name)."""

    category = "Image"
|
||||
|
||||
@ -2,7 +2,7 @@ import email
|
||||
import sys
|
||||
import re
|
||||
from email.message import Message
|
||||
from typing import Dict, IO, List, Optional, Tuple
|
||||
from typing import Dict, IO, List, Optional, Tuple, Union
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
@ -24,7 +24,7 @@ from unstructured.documents.email_elements import (
|
||||
ReceivedInfo,
|
||||
MetaData,
|
||||
)
|
||||
from unstructured.documents.elements import Element, Text
|
||||
from unstructured.documents.elements import Element, Text, Image, NarrativeText, Title
|
||||
from unstructured.partition.html import partition_html
|
||||
from unstructured.partition.text import split_by_paragraph, partition_text
|
||||
|
||||
@ -113,6 +113,25 @@ def extract_attachment_info(
|
||||
return list_attachments
|
||||
|
||||
|
||||
# Matches a single plain-text image placeholder such as "[image: logo.png]".
# The name part stops at the first "]" (and never crosses a line break), so two
# placeholders on the same line are not merged into one oversized match.
_EMBEDDED_IMAGE_PATTERN = re.compile(r"\[image: [^\]\n]+\]")


def has_embedded_image(element) -> Optional[re.Match]:
    """Look for an embedded-image placeholder in an element's text.

    Parameters
    ----------
    element
        Any object exposing a ``text`` string attribute.

    Returns
    -------
    The ``re.Match`` for the first ``[image: <name>]`` placeholder found in
    ``element.text``, or ``None`` when the text contains no placeholder. The
    match is truthy, so the result can be used directly in a condition.
    """
    return _EMBEDDED_IMAGE_PATTERN.search(element.text)
|
||||
|
||||
|
||||
def find_embedded_image(
    element: Union[NarrativeText, Title], indices: re.Match
) -> Tuple[Element, Element]:
    """Split an embedded-image placeholder out of a text element.

    Parameters
    ----------
    element
        The text element whose ``text`` contains a ``[image: <name>]`` placeholder.
        It is modified in place: the placeholder is removed from ``element.text``.
    indices
        The regex match locating the placeholder inside ``element.text`` (as
        returned by ``has_embedded_image``).

    Returns
    -------
    A tuple ``(image, element)`` where ``image`` is a new ``Image`` element whose
    text is the image name, and ``element`` is the same input object after the
    placeholder has been removed from its text.
    """
    image_raw_info = element.text[indices.start() : indices.end()]

    # Split on the first colon only, so names that themselves contain ":" are
    # kept whole instead of being truncated.
    _, _, name_part = image_raw_info.partition(":")
    # Drop the closing bracket before cleaning whitespace, so a space in front of
    # "]" does not survive in the image name.
    if name_part.endswith("]"):
        name_part = name_part[:-1]
    image_name = clean_extra_whitespace(name_part)

    # Remove the placeholder exactly as it appeared in the text. Rebuilding it from
    # the cleaned name would fail to match whenever the cleaning changed the name.
    element.text = element.text.replace(image_raw_info, "")

    return Image(text=image_name), element
|
||||
|
||||
|
||||
def partition_email(
|
||||
filename: Optional[str] = None,
|
||||
file: Optional[IO] = None,
|
||||
@ -171,7 +190,7 @@ def partition_email(
|
||||
raise ValueError(f"{content_source} content not found in email")
|
||||
|
||||
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
|
||||
# looks like the following, resulting in extraneous "=" chracters in the output if
|
||||
# looks like the following, resulting in extraneous "=" characters in the output if
|
||||
# you don't clean it up
|
||||
# <ul> =
|
||||
# <li>Item 1</li>=
|
||||
@ -188,6 +207,13 @@ def partition_email(
|
||||
elif content_source == "text/plain":
|
||||
elements = partition_text(text=content)
|
||||
|
||||
for idx, element in enumerate(elements):
|
||||
indices = has_embedded_image(element)
|
||||
if (isinstance(element, NarrativeText) or isinstance(element, Title)) and indices:
|
||||
image_info, clean_element = find_embedded_image(element, indices)
|
||||
elements[idx] = clean_element
|
||||
elements.insert(idx + 1, image_info)
|
||||
|
||||
header: List[Element] = list()
|
||||
if include_headers:
|
||||
header = partition_email_header(msg)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user