Refactor: support image extraction (#2201)

### Summary
This PR is the second part of the "image extraction" refactor, which moves
the image extraction code from the unstructured-inference repo to the
unstructured repo. The first part was done in
https://github.com/Unstructured-IO/unstructured-inference/pull/299. This
PR adds the logic that supports extracting images.

### Testing

`git clone -b refactor/remove_image_extraction_code --single-branch
https://github.com/Unstructured-IO/unstructured-inference.git && cd
unstructured-inference && pip install -e . && cd ../`

```
elements = partition_pdf(
        filename="example-docs/embedded-images.pdf",
        strategy="hi_res",
        extract_images_in_pdf=True,
    )

print("\n\n".join([str(el) for el in elements]))
```
This commit is contained in:
Christine Straub 2023-12-05 10:22:29 -08:00 committed by GitHub
parent c5cb216ac8
commit ed76b11b1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 334 additions and 91 deletions

View File

@ -2,6 +2,7 @@
### Enhancements
* **Refactor image extraction code.** The image extraction code is moved from `unstructured-inference` to `unstructured`.
* **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.
### Features

View File

@ -36,7 +36,7 @@ idna==3.6
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.9.0
importlib-metadata==7.0.0
# via sphinx
jinja2==3.1.2
# via

View File

@ -36,7 +36,7 @@ idna==3.6
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.9.0
importlib-metadata==7.0.0
# via sphinx
jinja2==3.1.2
# via

View File

@ -91,7 +91,7 @@ idna==3.6
# anyio
# jsonschema
# requests
importlib-metadata==6.9.0
importlib-metadata==7.0.0
# via
# build
# jupyter-client
@ -167,7 +167,7 @@ jupyter-events==0.9.0
# via jupyter-server
jupyter-lsp==2.2.1
# via jupyterlab
jupyter-server==2.11.1
jupyter-server==2.11.2
# via
# jupyter-lsp
# jupyterlab
@ -198,7 +198,7 @@ mistune==3.0.2
# via nbconvert
nbclient==0.9.0
# via nbconvert
nbconvert==7.11.0
nbconvert==7.12.0
# via
# jupyter
# jupyter-server
@ -290,7 +290,7 @@ pyyaml==6.0.1
# -c test.txt
# jupyter-events
# pre-commit
pyzmq==25.1.1
pyzmq==25.1.2
# via
# ipykernel
# jupyter-client
@ -405,7 +405,7 @@ webencodings==0.5.1
# via
# bleach
# tinycss2
websocket-client==1.6.4
websocket-client==1.7.0
# via jupyter-server
wheel==0.42.0
# via

View File

@ -4,7 +4,7 @@
#
# pip-compile --output-file=extra-markdown.txt extra-markdown.in
#
importlib-metadata==6.9.0
importlib-metadata==7.0.0
# via markdown
markdown==3.5.1
# via -r extra-markdown.in

View File

@ -45,7 +45,7 @@ flask==3.0.0
# visualdl
flask-babel==4.0.0
# via visualdl
fonttools==4.45.1
fonttools==4.46.0
# via matplotlib
future==0.18.3
# via bce-python-sdk
@ -59,7 +59,7 @@ imageio==2.33.0
# scikit-image
imgaug==0.4.0
# via unstructured-paddleocr
importlib-metadata==6.9.0
importlib-metadata==7.0.0
# via flask
importlib-resources==6.1.1
# via matplotlib

View File

@ -8,7 +8,7 @@ pikepdf
pypdf
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.17
unstructured-inference==0.7.18
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12

View File

@ -37,7 +37,7 @@ filelock==3.13.1
# transformers
flatbuffers==23.5.26
# via onnxruntime
fonttools==4.45.1
fonttools==4.46.0
# via matplotlib
fsspec==2023.9.1
# via
@ -134,7 +134,7 @@ pdfminer-six==20221105
# pdfplumber
pdfplumber==0.10.3
# via layoutparser
pikepdf==8.7.1
pikepdf==8.8.0
# via -r extra-pdf-image.in
pillow==10.0.1
# via
@ -250,7 +250,7 @@ typing-extensions==4.8.0
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.7.17
unstructured-inference==0.7.18
# via -r extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via

View File

@ -60,7 +60,7 @@ idna==3.6
# yarl
isodate==0.6.1
# via azure-storage-blob
msal==1.25.0
msal==1.26.0
# via
# azure-datalake-store
# azure-identity

View File

@ -4,7 +4,7 @@
#
# pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
#
deltalake==0.13.0
deltalake==0.14.0
# via -r ingest/delta-table.in
fsspec==2023.9.1
# via

View File

@ -64,11 +64,11 @@ jsonpatch==1.33
# langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.0.344
langchain==0.0.345
# via -r ingest/embed-aws-bedrock.in
langchain-core==0.0.8
langchain-core==0.0.9
# via langchain
langsmith==0.0.68
langsmith==0.0.69
# via
# langchain
# langchain-core

View File

@ -79,11 +79,11 @@ jsonpatch==1.33
# langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.0.344
langchain==0.0.345
# via -r ingest/embed-huggingface.in
langchain-core==0.0.8
langchain-core==0.0.9
# via langchain
langsmith==0.0.68
langsmith==0.0.69
# via
# langchain
# langchain-core

View File

@ -64,11 +64,11 @@ jsonpatch==1.33
# langchain-core
jsonpointer==2.4
# via jsonpatch
langchain==0.0.344
langchain==0.0.345
# via -r ingest/embed-openai.in
langchain-core==0.0.8
langchain-core==0.0.9
# via langchain
langsmith==0.0.68
langsmith==0.0.69
# via
# langchain
# langchain-core
@ -125,7 +125,7 @@ tenacity==8.2.3
# via
# langchain
# langchain-core
tiktoken==0.5.1
tiktoken==0.5.2
# via -r ingest/embed-openai.in
tqdm==4.66.1
# via

View File

@ -29,7 +29,7 @@ idna==3.6
# via
# -c ingest/../base.txt
# requests
msal==1.25.0
msal==1.26.0
# via
# -r ingest/onedrive.in
# office365-rest-python-client

View File

@ -23,7 +23,7 @@ idna==3.6
# via
# -c ingest/../base.txt
# requests
msal==1.25.0
msal==1.26.0
# via
# -r ingest/outlook.in
# office365-rest-python-client

View File

@ -33,5 +33,5 @@ urllib3==1.26.18
# -c ingest/../base.txt
# -c ingest/../constraints.in
# requests
websocket-client==1.6.4
websocket-client==1.7.0
# via praw

View File

@ -23,7 +23,7 @@ idna==3.6
# via
# -c ingest/../base.txt
# requests
msal==1.25.0
msal==1.26.0
# via
# -r ingest/sharepoint.in
# office365-rest-python-client

View File

@ -4,5 +4,5 @@
#
# pip-compile --output-file=ingest/slack.txt ingest/slack.in
#
slack-sdk==3.26.0
slack-sdk==3.26.1
# via -r ingest/slack.in

View File

@ -2,44 +2,41 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --constraint=requirements/constraints.in requirements/ingest/weaviate.in
# pip-compile --output-file=ingest/weaviate.txt ingest/weaviate.in
#
authlib==1.2.1
# via weaviate-client
certifi==2023.11.17
# via
# -c requirements/constraints.in
# -c requirements/ingest/../base.txt
# -c requirements/ingest/../constraints.in
# -c ingest/../base.txt
# -c ingest/../constraints.in
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.3.2
# via
# -c requirements/ingest/../base.txt
# -c ingest/../base.txt
# requests
cryptography==41.0.5
cryptography==41.0.7
# via authlib
idna==3.4
idna==3.6
# via
# -c requirements/ingest/../base.txt
# -c ingest/../base.txt
# requests
pycparser==2.21
# via cffi
requests==2.31.0
# via
# -c requirements/ingest/../base.txt
# -c ingest/../base.txt
# weaviate-client
urllib3==1.26.18
# via
# -c requirements/constraints.in
# -c requirements/ingest/../base.txt
# -c requirements/ingest/../constraints.in
# -c ingest/../base.txt
# -c ingest/../constraints.in
# requests
validators==0.22.0
# via weaviate-client
weaviate-client==3.25.3
# via
# -c requirements/constraints.in
# -c requirements/ingest/../constraints.in
# -r requirements/ingest/weaviate.in
# -c ingest/../constraints.in
# -r ingest/weaviate.in

View File

@ -36,7 +36,7 @@ flake8==6.1.0
# flake8-print
flake8-print==5.0.0
# via -r test.in
freezegun==1.2.2
freezegun==1.3.1
# via -r test.in
grpcio==1.59.3
# via -r test.in
@ -111,7 +111,7 @@ requests==2.31.0
# via
# -c base.txt
# label-studio-sdk
ruff==0.1.6
ruff==0.1.7
# via -r test.in
six==1.16.0
# via

View File

@ -0,0 +1,115 @@
import os
import tempfile
import numpy as np
import pytest
from PIL import Image as PILImg
from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, Image
from unstructured.partition.pdf_image import pdf_image_utils
@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
def test_write_image(image_type):
    """Check that `write_image` persists PIL and numpy images as readable files.

    The test is parametrized over the two supported input types. Each input is
    a blank 50x50 three-channel image that is written to a JPEG file inside a
    temporary directory and then read back.
    """
    mock_pil_image = PILImg.new("RGB", (50, 50))
    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)

    image_map = {
        "pil": mock_pil_image,
        "numpy_array": mock_numpy_image,
    }
    image = image_map[image_type]

    with tempfile.TemporaryDirectory() as tmpdir:
        output_image_path = os.path.join(tmpdir, "test_image.jpg")
        pdf_image_utils.write_image(image, output_image_path)

        assert os.path.exists(output_image_path)

        # Re-open the written file to prove it is a decodable image with the
        # expected dimensions. The context manager closes the file handle
        # before the temporary directory is removed; a handle left open can
        # make that cleanup fail on platforms that lock open files.
        with PILImg.open(output_image_path) as read_image:
            assert read_image.size == (50, 50)
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(
    file_mode, path_only, filename=example_doc_path("embedded-images.pdf")
):
    """Render the sample PDF from a path or from an open binary file.

    With `path_only` set, the converter is expected to hand back file paths
    (strings); otherwise it is expected to hand back PIL images.
    """
    expected_type = str if path_only else PILImg.Image

    with tempfile.TemporaryDirectory() as out_dir:
        if file_mode != "filename":
            # "rb" mode: pass an open binary stream and leave the filename blank.
            with open(filename, "rb") as pdf_stream:
                pages = pdf_image_utils.convert_pdf_to_image(
                    filename="",
                    file=pdf_stream,
                    output_folder=out_dir,
                    path_only=path_only,
                )
        else:
            pages = pdf_image_utils.convert_pdf_to_image(
                filename=filename,
                file=None,
                output_folder=out_dir,
                path_only=path_only,
            )

        first_page = pages[0]
        assert isinstance(first_page, expected_type)
def test_extract_images_from_elements(filename=example_doc_path("embedded-images.pdf")):
    """Extract three image regions from page 1 of the sample PDF.

    Each element must end up with `metadata.image_path` pointing at an existing
    file named `figure-<page_number>-<n>.jpg` inside the output directory,
    where `n` counts the image elements starting from 1.
    """
    # (text, corner points) per element; corners are listed in the order
    # (x1, y1), (x1, y2), (x2, y2), (x2, y1).
    figure_specs = [
        (
            "3",
            (
                (78.7401411111111, 86.61545694444455),
                (78.7401411111111, 519.9487805555556),
                (512.0734647222223, 519.9487805555556),
                (512.0734647222223, 86.61545694444455),
            ),
        ),
        (
            "4",
            (
                (570.8661397222222, 86.6154566666667),
                (570.8661397222222, 519.6862825000001),
                (1003.9369655555556, 519.6862825000001),
                (1003.9369655555556, 86.6154566666667),
            ),
        ),
        (
            "5",
            (
                (1062.9921808333331, 86.61545694444455),
                (1062.9921808333331, 519.9487805555556),
                (1496.3255044444445, 519.9487805555556),
                (1496.3255044444445, 86.61545694444455),
            ),
        ),
    ]

    with tempfile.TemporaryDirectory() as tmpdir:
        out_dir = str(tmpdir)
        elements = [
            Image(
                text=label,
                coordinates=corners,
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            )
            for label, corners in figure_specs
        ]

        pdf_image_utils.extract_images_from_elements(
            elements=elements, pdf_image_dpi=200, filename=filename, output_dir_path=out_dir
        )

        for position, element in enumerate(elements, start=1):
            page = element.metadata.page_number
            expected_image_path = os.path.join(out_dir, f"figure-{page}-{position}.jpg")
            assert os.path.isfile(element.metadata.image_path)
            assert element.metadata.image_path == expected_image_path
def test_write_image_raises_error():
    """`write_image` must reject inputs that are neither PIL images nor numpy arrays."""
    unsupported_image = "invalid_type"
    target_path = "test_image.jpg"
    with pytest.raises(ValueError):
        pdf_image_utils.write_image(unsupported_image, target_path)

View File

@ -555,6 +555,44 @@ def _add_regex_metadata(
return elements
class ElementType:
    """Namespace of string constants naming element categories.

    The constants are plain class attributes, so they are used as
    `ElementType.IMAGE`, `ElementType.TITLE`, and so on. Elsewhere in this
    module they serve as a class-level `category` value and as keys of
    `TYPE_TO_TEXT_ELEMENT_MAP`.
    """

    # NOTE(review): some values are CamelCase ("NarrativeText", "ListItem")
    # while others are hyphenated ("Field-Name", "List-item", "Page-header").
    # The hyphenated ones presumably mirror labels produced by an external
    # layout model -- confirm before renaming or normalizing any value.
    TITLE = "Title"
    TEXT = "Text"
    UNCATEGORIZED_TEXT = "UncategorizedText"
    NARRATIVE_TEXT = "NarrativeText"
    BULLETED_TEXT = "BulletedText"
    ABSTRACT = "Abstract"
    THREADING = "Threading"
    FORM = "Form"
    FIELD_NAME = "Field-Name"
    VALUE = "Value"
    LINK = "Link"
    COMPOSITE_ELEMENT = "CompositeElement"
    IMAGE = "Image"
    PICTURE = "Picture"
    FIGURE_CAPTION = "FigureCaption"
    FIGURE = "Figure"
    CAPTION = "Caption"
    LIST = "List"
    LIST_ITEM = "ListItem"
    # Second spelling of the list-item label; distinct value from LIST_ITEM.
    LIST_ITEM_OTHER = "List-item"
    CHECKED = "Checked"
    UNCHECKED = "Unchecked"
    ADDRESS = "Address"
    EMAIL_ADDRESS = "EmailAddress"
    PAGE_BREAK = "PageBreak"
    FORMULA = "Formula"
    TABLE = "Table"
    HEADER = "Header"
    HEADLINE = "Headline"
    SUB_HEADLINE = "Subheadline"
    PAGE_HEADER = "Page-header"  # Title?
    SECTION_HEADER = "Section-header"
    FOOTER = "Footer"
    FOOTNOTE = "Footnote"
    PAGE_FOOTER = "Page-footer"
class Element(abc.ABC):
"""An element is a section of a page in the document."""
@ -764,7 +802,7 @@ class EmailAddress(Text):
class Image(Text):
"""A text element for capturing image metadata."""
category = "Image"
category = ElementType.IMAGE
class PageBreak(Text):
@ -797,44 +835,6 @@ class Footer(Text):
category = "Footer"
class ElementType:
    """Namespace of string constants naming element categories.

    The constants are plain class attributes, so they are used as
    `ElementType.IMAGE`, `ElementType.TITLE`, and so on. Elsewhere in this
    module they serve as keys of `TYPE_TO_TEXT_ELEMENT_MAP`.
    """

    # NOTE(review): some values are CamelCase ("NarrativeText", "ListItem")
    # while others are hyphenated ("Field-Name", "List-item", "Page-header").
    # The hyphenated ones presumably mirror labels produced by an external
    # layout model -- confirm before renaming or normalizing any value.
    TITLE = "Title"
    TEXT = "Text"
    UNCATEGORIZED_TEXT = "UncategorizedText"
    NARRATIVE_TEXT = "NarrativeText"
    BULLETED_TEXT = "BulletedText"
    ABSTRACT = "Abstract"
    THREADING = "Threading"
    FORM = "Form"
    FIELD_NAME = "Field-Name"
    VALUE = "Value"
    LINK = "Link"
    COMPOSITE_ELEMENT = "CompositeElement"
    IMAGE = "Image"
    PICTURE = "Picture"
    FIGURE_CAPTION = "FigureCaption"
    FIGURE = "Figure"
    CAPTION = "Caption"
    LIST = "List"
    LIST_ITEM = "ListItem"
    # Second spelling of the list-item label; distinct value from LIST_ITEM.
    LIST_ITEM_OTHER = "List-item"
    CHECKED = "Checked"
    UNCHECKED = "Unchecked"
    ADDRESS = "Address"
    EMAIL_ADDRESS = "EmailAddress"
    PAGE_BREAK = "PageBreak"
    FORMULA = "Formula"
    TABLE = "Table"
    HEADER = "Header"
    HEADLINE = "Headline"
    SUB_HEADLINE = "Subheadline"
    PAGE_HEADER = "Page-header"  # Title?
    SECTION_HEADER = "Section-header"
    FOOTER = "Footer"
    FOOTNOTE = "Footnote"
    PAGE_FOOTER = "Page-footer"
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
ElementType.TITLE: Title,
ElementType.SECTION_HEADER: Title,

View File

@ -70,6 +70,7 @@ from unstructured.partition.lang import (
check_languages,
prepare_languages_for_tesseract,
)
from unstructured.partition.pdf_image.pdf_image_utils import extract_images_from_elements
from unstructured.partition.pdf_image.pdfminer_utils import (
open_pdfminer_pages_generator,
rect_to_bbox,
@ -381,8 +382,6 @@ def _partition_pdf_or_image_local(
is_image=is_image,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
@ -411,8 +410,6 @@ def _partition_pdf_or_image_local(
is_image=is_image,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
extract_images_in_pdf=extract_images_in_pdf,
image_output_dir_path=image_output_dir_path,
)
if hasattr(file, "seek"):
file.seek(0)
@ -458,6 +455,15 @@ def _partition_pdf_or_image_local(
**kwargs,
)
if extract_images_in_pdf:
extract_images_from_elements(
elements=elements,
filename=filename,
file=file,
pdf_image_dpi=pdf_image_dpi,
output_dir_path=image_output_dir_path,
)
out_elements = []
for el in elements:
if isinstance(el, PageBreak) and not include_page_breaks:

View File

@ -0,0 +1,124 @@
import os
import tempfile
from pathlib import PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
import cv2
import numpy as np
import pdf2image
from PIL import Image
from unstructured.documents.elements import ElementType
from unstructured.logger import logger
from unstructured.partition.common import convert_to_bytes
if TYPE_CHECKING:
from unstructured.documents.elements import Element
def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
    """
    Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.

    Parameters:
    - image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image
      format or a numpy ndarray format.
    - output_image_path (str): The path to which the image will be written.

    Raises:
    - ValueError: If the provided image type is neither PIL Image nor numpy ndarray.
    - OSError: If OpenCV reports that a numpy ndarray image could not be written.

    Returns:
    - None: The function writes the image to the specified path but does not return any value.
    """
    if isinstance(image, Image.Image):
        image.save(output_image_path)
    elif isinstance(image, np.ndarray):
        # cv2.imwrite reports a failed write through its boolean return value
        # instead of raising, so an ignored result would let callers believe
        # the file exists. Surface the failure as an OSError (IOError is an
        # alias of it), which is what the PIL branch raises on I/O problems.
        if not cv2.imwrite(output_image_path, image):
            raise OSError(f"Failed to write image to {output_image_path}")
    else:
        raise ValueError("Unsupported Image Type")
def convert_pdf_to_image(
    filename: str,
    file: Optional[Union[bytes, BinaryIO]] = None,
    dpi: int = 200,
    output_folder: Optional[Union[str, PurePath]] = None,
    path_only: bool = False,
) -> Union[List[Image.Image], List[str]]:
    """Render the pages of a PDF with pdf2image.

    When `file` is given it is converted to bytes and rendered from memory;
    otherwise the PDF is read from `filename`. `dpi`, `output_folder` and
    `path_only` are forwarded to pdf2image (the latter as `paths_only`).

    Raises:
        ValueError: If `path_only` is true but no `output_folder` is given.
    """
    if path_only and not output_folder:
        raise ValueError("output_folder must be specified if path_only is true")

    # Options shared by both pdf2image entry points.
    render_options = {
        "dpi": dpi,
        "output_folder": output_folder,
        "paths_only": path_only,
    }

    if file is None:
        return pdf2image.convert_from_path(filename, **render_options)

    pdf_bytes = convert_to_bytes(file)
    return pdf2image.convert_from_bytes(pdf_bytes, **render_options)
def extract_images_from_elements(
    elements: List["Element"],
    pdf_image_dpi: int,
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO]] = None,
    output_dir_path: Optional[str] = None,
):
    """
    Extract and save images from the page. This method iterates through the layout elements
    of the page, identifies image regions, and extracts and saves them as separate image files.

    The PDF pages are rendered into a temporary directory at `pdf_image_dpi`. For every element
    whose category is `ElementType.IMAGE` and that has coordinate points, the region spanned by
    points[0] and points[2] is cropped from the rendered page and written to
    `<output_dir_path>/figure-<page_number>-<figure_number>.jpg`. On success the path is stored
    in `el.metadata.image_path`; the elements are modified in place and nothing is returned.

    Parameters:
    - elements: The elements to scan; elements that are not images or have no coordinates
      are skipped.
    - pdf_image_dpi: Resolution used to render the PDF pages.
    - filename: Path of the PDF, used when `file` is not given.
    - file: The PDF as bytes or as a binary file object.
    - output_dir_path: Directory for the extracted images; defaults to a "figures" directory
      under the current working directory. It is created if it does not exist.

    A failure on one image is logged as a warning and does not stop the remaining images.
    """
    if not output_dir_path:
        output_dir_path = os.path.join(os.getcwd(), "figures")
    os.makedirs(output_dir_path, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        _image_paths = convert_pdf_to_image(
            filename,
            file,
            pdf_image_dpi,
            output_folder=temp_dir,
            path_only=True,
        )
        image_paths = cast(List[str], _image_paths)

        figure_number = 0
        for el in elements:
            coordinates = el.metadata.coordinates
            if not coordinates or not coordinates.points or el.category != ElementType.IMAGE:
                continue

            points = coordinates.points
            x1, y1 = points[0]
            x2, y2 = points[2]
            page_number = el.metadata.page_number

            # The counter advances for every image element, even one that fails below.
            figure_number += 1
            try:
                # page_number is 1-based. A missing or out-of-range value would otherwise
                # raise an uncaught TypeError/IndexError, and 0 would silently pick the
                # last page; treat all of these as a failed image instead.
                if page_number is None or not 1 <= page_number <= len(image_paths):
                    raise ValueError(
                        f"No rendered page for page_number={page_number!r} "
                        f"({len(image_paths)} page(s) rendered)"
                    )

                output_f_path = os.path.join(
                    output_dir_path,
                    f"figure-{page_number}-{figure_number}.jpg",
                )
                # Close the page image as soon as the crop has been written so that file
                # handles do not pile up across elements.
                with Image.open(image_paths[page_number - 1]) as image:
                    cropped_image = image.crop((x1, y1, x2, y2))
                    write_image(cropped_image, output_f_path)
                # add image path to element metadata
                el.metadata.image_path = output_f_path
            except (ValueError, IOError):
                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)