Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-11-08 14:39:27 +00:00
Refactor: support image extraction (#2201)
### Summary

This PR is the second part of the "image extraction" refactor, which moves the image extraction code from the `unstructured-inference` repo to the `unstructured` repo; the first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/299. This PR adds the logic to support extracting images.

### Testing

`git clone -b refactor/remove_image_extraction_code --single-branch https://github.com/Unstructured-IO/unstructured-inference.git && cd unstructured-inference && pip install -e . && cd ../`

```
elements = partition_pdf(
    filename="example-docs/embedded-images.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
)
print("\n\n".join([str(el) for el in elements]))
```
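As a quick follow-up check (an illustrative sketch, not part of the original PR description), the extracted figure locations can be read back from the element metadata; the `figures/` default directory and the `metadata.image_path` field come from the `extract_images_from_elements` helper added in this PR:

```python
from unstructured.documents.elements import Image
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
)

# Each detected figure is cropped from the page rendering, written to disk
# (./figures by default), and its output path is recorded on the element.
for el in elements:
    if isinstance(el, Image):
        print(el.metadata.page_number, el.metadata.image_path)
```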
This commit is contained in:
parent c5cb216ac8
commit ed76b11b1a
@@ -2,6 +2,7 @@

 ### Enhancements

+* **Refactor image extraction code.** The image extraction code is moved from `unstructured-inference` to `unstructured`.
 * **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.

 ### Features

@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via sphinx
 jinja2==3.1.2
     # via

@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via sphinx
 jinja2==3.1.2
     # via

@@ -91,7 +91,7 @@ idna==3.6
     #   anyio
     #   jsonschema
     #   requests
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via
     #   build
     #   jupyter-client
@@ -167,7 +167,7 @@ jupyter-events==0.9.0
     # via jupyter-server
 jupyter-lsp==2.2.1
     # via jupyterlab
-jupyter-server==2.11.1
+jupyter-server==2.11.2
     # via
     #   jupyter-lsp
     #   jupyterlab
@@ -198,7 +198,7 @@ mistune==3.0.2
     # via nbconvert
 nbclient==0.9.0
     # via nbconvert
-nbconvert==7.11.0
+nbconvert==7.12.0
     # via
     #   jupyter
     #   jupyter-server
@@ -290,7 +290,7 @@ pyyaml==6.0.1
     #   -c test.txt
     #   jupyter-events
     #   pre-commit
-pyzmq==25.1.1
+pyzmq==25.1.2
     # via
     #   ipykernel
     #   jupyter-client
@@ -405,7 +405,7 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via jupyter-server
 wheel==0.42.0
     # via

@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=extra-markdown.txt extra-markdown.in
 #
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via markdown
 markdown==3.5.1
     # via -r extra-markdown.in

@@ -45,7 +45,7 @@ flask==3.0.0
     #   visualdl
 flask-babel==4.0.0
     # via visualdl
-fonttools==4.45.1
+fonttools==4.46.0
     # via matplotlib
 future==0.18.3
     # via bce-python-sdk
@@ -59,7 +59,7 @@ imageio==2.33.0
     #   scikit-image
 imgaug==0.4.0
     # via unstructured-paddleocr
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via flask
 importlib-resources==6.1.1
     # via matplotlib

@@ -8,7 +8,7 @@ pikepdf
 pypdf
 # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12

@@ -37,7 +37,7 @@ filelock==3.13.1
     #   transformers
 flatbuffers==23.5.26
     # via onnxruntime
-fonttools==4.45.1
+fonttools==4.46.0
     # via matplotlib
 fsspec==2023.9.1
     # via
@@ -134,7 +134,7 @@ pdfminer-six==20221105
     #   pdfplumber
 pdfplumber==0.10.3
     # via layoutparser
-pikepdf==8.7.1
+pikepdf==8.8.0
     # via -r extra-pdf-image.in
 pillow==10.0.1
     # via
@@ -250,7 +250,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
     # via -r extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via

@@ -60,7 +60,7 @@ idna==3.6
     #   yarl
 isodate==0.6.1
     # via azure-storage-blob
-msal==1.25.0
+msal==1.26.0
     # via
     #   azure-datalake-store
     #   azure-identity

@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
 #
-deltalake==0.13.0
+deltalake==0.14.0
     # via -r ingest/delta-table.in
 fsspec==2023.9.1
     # via

@@ -64,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-aws-bedrock.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core

@@ -79,11 +79,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-huggingface.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core

@@ -64,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-openai.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core
@@ -125,7 +125,7 @@ tenacity==8.2.3
     # via
     #   langchain
     #   langchain-core
-tiktoken==0.5.1
+tiktoken==0.5.2
     # via -r ingest/embed-openai.in
 tqdm==4.66.1
     # via

@@ -29,7 +29,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/onedrive.in
     #   office365-rest-python-client

@@ -23,7 +23,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/outlook.in
     #   office365-rest-python-client

@@ -33,5 +33,5 @@ urllib3==1.26.18
     #   -c ingest/../base.txt
     #   -c ingest/../constraints.in
     #   requests
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via praw

@@ -23,7 +23,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/sharepoint.in
     #   office365-rest-python-client

@@ -4,5 +4,5 @@
 #
 # pip-compile --output-file=ingest/slack.txt ingest/slack.in
 #
-slack-sdk==3.26.0
+slack-sdk==3.26.1
     # via -r ingest/slack.in

@@ -2,44 +2,41 @@
 # This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-# pip-compile --constraint=requirements/constraints.in requirements/ingest/weaviate.in
+# pip-compile --output-file=ingest/weaviate.txt ingest/weaviate.in
 #
 authlib==1.2.1
     # via weaviate-client
 certifi==2023.11.17
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../base.txt
-    #   -c requirements/ingest/../constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
 cffi==1.16.0
     # via cryptography
 charset-normalizer==3.3.2
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   requests
-cryptography==41.0.5
+cryptography==41.0.7
     # via authlib
-idna==3.4
+idna==3.6
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   requests
 pycparser==2.21
     # via cffi
 requests==2.31.0
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   weaviate-client
 urllib3==1.26.18
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../base.txt
-    #   -c requirements/ingest/../constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
 validators==0.22.0
     # via weaviate-client
 weaviate-client==3.25.3
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../constraints.in
-    #   -r requirements/ingest/weaviate.in
+    #   -c ingest/../constraints.in
+    #   -r ingest/weaviate.in

@@ -36,7 +36,7 @@ flake8==6.1.0
     #   flake8-print
 flake8-print==5.0.0
     # via -r test.in
-freezegun==1.2.2
+freezegun==1.3.1
     # via -r test.in
 grpcio==1.59.3
     # via -r test.in
@@ -111,7 +111,7 @@ requests==2.31.0
     # via
     #   -c base.txt
     #   label-studio-sdk
-ruff==0.1.6
+ruff==0.1.7
     # via -r test.in
 six==1.16.0
     # via

test_unstructured/partition/pdf_image/test_pdf_image_utils.py (new file, 115 lines)
@@ -0,0 +1,115 @@
+import os
+import tempfile
+
+import numpy as np
+import pytest
+from PIL import Image as PILImg
+
+from test_unstructured.unit_utils import example_doc_path
+from unstructured.documents.coordinates import PixelSpace
+from unstructured.documents.elements import ElementMetadata, Image
+from unstructured.partition.pdf_image import pdf_image_utils
+
+
+@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
+def test_write_image(image_type):
+    mock_pil_image = PILImg.new("RGB", (50, 50))
+    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)
+
+    image_map = {
+        "pil": mock_pil_image,
+        "numpy_array": mock_numpy_image,
+    }
+    image = image_map[image_type]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_image_path = os.path.join(tmpdir, "test_image.jpg")
+        pdf_image_utils.write_image(image, output_image_path)
+        assert os.path.exists(output_image_path)
+
+        # Additional check to see if the written image can be read
+        read_image = PILImg.open(output_image_path)
+        assert read_image is not None
+
+
+@pytest.mark.parametrize("file_mode", ["filename", "rb"])
+@pytest.mark.parametrize("path_only", [True, False])
+def test_convert_pdf_to_image(
+    file_mode, path_only, filename=example_doc_path("embedded-images.pdf")
+):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        if file_mode == "filename":
+            images = pdf_image_utils.convert_pdf_to_image(
+                filename=filename,
+                file=None,
+                output_folder=tmpdir,
+                path_only=path_only,
+            )
+        else:
+            with open(filename, "rb") as f:
+                images = pdf_image_utils.convert_pdf_to_image(
+                    filename="",
+                    file=f,
+                    output_folder=tmpdir,
+                    path_only=path_only,
+                )
+
+        if path_only:
+            assert isinstance(images[0], str)
+        else:
+            assert isinstance(images[0], PILImg.Image)
+
+
+def test_extract_images_from_elements(filename=example_doc_path("embedded-images.pdf")):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        elements = [
+            Image(
+                text="3",
+                coordinates=(
+                    (78.7401411111111, 86.61545694444455),
+                    (78.7401411111111, 519.9487805555556),
+                    (512.0734647222223, 519.9487805555556),
+                    (512.0734647222223, 86.61545694444455),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+            Image(
+                text="4",
+                coordinates=(
+                    (570.8661397222222, 86.6154566666667),
+                    (570.8661397222222, 519.6862825000001),
+                    (1003.9369655555556, 519.6862825000001),
+                    (1003.9369655555556, 86.6154566666667),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+            Image(
+                text="5",
+                coordinates=(
+                    (1062.9921808333331, 86.61545694444455),
+                    (1062.9921808333331, 519.9487805555556),
+                    (1496.3255044444445, 519.9487805555556),
+                    (1496.3255044444445, 86.61545694444455),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+        ]
+
+        pdf_image_utils.extract_images_from_elements(
+            elements=elements, pdf_image_dpi=200, filename=filename, output_dir_path=str(tmpdir)
+        )
+
+        for i, el in enumerate(elements):
+            expected_image_path = os.path.join(
+                str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
+            )
+            assert os.path.isfile(el.metadata.image_path)
+            assert el.metadata.image_path == expected_image_path
+
+
+def test_write_image_raises_error():
+    with pytest.raises(ValueError):
+        pdf_image_utils.write_image("invalid_type", "test_image.jpg")

@@ -555,6 +555,44 @@ def _add_regex_metadata(
     return elements


+class ElementType:
+    TITLE = "Title"
+    TEXT = "Text"
+    UNCATEGORIZED_TEXT = "UncategorizedText"
+    NARRATIVE_TEXT = "NarrativeText"
+    BULLETED_TEXT = "BulletedText"
+    ABSTRACT = "Abstract"
+    THREADING = "Threading"
+    FORM = "Form"
+    FIELD_NAME = "Field-Name"
+    VALUE = "Value"
+    LINK = "Link"
+    COMPOSITE_ELEMENT = "CompositeElement"
+    IMAGE = "Image"
+    PICTURE = "Picture"
+    FIGURE_CAPTION = "FigureCaption"
+    FIGURE = "Figure"
+    CAPTION = "Caption"
+    LIST = "List"
+    LIST_ITEM = "ListItem"
+    LIST_ITEM_OTHER = "List-item"
+    CHECKED = "Checked"
+    UNCHECKED = "Unchecked"
+    ADDRESS = "Address"
+    EMAIL_ADDRESS = "EmailAddress"
+    PAGE_BREAK = "PageBreak"
+    FORMULA = "Formula"
+    TABLE = "Table"
+    HEADER = "Header"
+    HEADLINE = "Headline"
+    SUB_HEADLINE = "Subheadline"
+    PAGE_HEADER = "Page-header"  # Title?
+    SECTION_HEADER = "Section-header"
+    FOOTER = "Footer"
+    FOOTNOTE = "Footnote"
+    PAGE_FOOTER = "Page-footer"
+
+
 class Element(abc.ABC):
     """An element is a section of a page in the document."""

@@ -764,7 +802,7 @@ class EmailAddress(Text):
 class Image(Text):
     """A text element for capturing image metadata."""

-    category = "Image"
+    category = ElementType.IMAGE


 class PageBreak(Text):
@@ -797,44 +835,6 @@ class Footer(Text):
     category = "Footer"


-class ElementType:
-    TITLE = "Title"
-    TEXT = "Text"
-    UNCATEGORIZED_TEXT = "UncategorizedText"
-    NARRATIVE_TEXT = "NarrativeText"
-    BULLETED_TEXT = "BulletedText"
-    ABSTRACT = "Abstract"
-    THREADING = "Threading"
-    FORM = "Form"
-    FIELD_NAME = "Field-Name"
-    VALUE = "Value"
-    LINK = "Link"
-    COMPOSITE_ELEMENT = "CompositeElement"
-    IMAGE = "Image"
-    PICTURE = "Picture"
-    FIGURE_CAPTION = "FigureCaption"
-    FIGURE = "Figure"
-    CAPTION = "Caption"
-    LIST = "List"
-    LIST_ITEM = "ListItem"
-    LIST_ITEM_OTHER = "List-item"
-    CHECKED = "Checked"
-    UNCHECKED = "Unchecked"
-    ADDRESS = "Address"
-    EMAIL_ADDRESS = "EmailAddress"
-    PAGE_BREAK = "PageBreak"
-    FORMULA = "Formula"
-    TABLE = "Table"
-    HEADER = "Header"
-    HEADLINE = "Headline"
-    SUB_HEADLINE = "Subheadline"
-    PAGE_HEADER = "Page-header"  # Title?
-    SECTION_HEADER = "Section-header"
-    FOOTER = "Footer"
-    FOOTNOTE = "Footnote"
-    PAGE_FOOTER = "Page-footer"
-
-
 TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
     ElementType.TITLE: Title,
     ElementType.SECTION_HEADER: Title,
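Aside (an illustrative sketch, not part of the diff): with `ElementType` now defined ahead of the element classes, category checks can rely on the shared constants instead of bare string literals:

```python
from unstructured.documents.elements import ElementType, Image

el = Image(text="logo")

# Comparing against the constant avoids typo-prone string literals like "Image".
assert el.category == ElementType.IMAGE

# The constants are plain strings, so existing string-based comparisons and
# serialized element dictionaries keep working unchanged.
assert ElementType.IMAGE == "Image"
```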

@@ -70,6 +70,7 @@ from unstructured.partition.lang import (
     check_languages,
     prepare_languages_for_tesseract,
 )
+from unstructured.partition.pdf_image.pdf_image_utils import extract_images_from_elements
 from unstructured.partition.pdf_image.pdfminer_utils import (
     open_pdfminer_pages_generator,
     rect_to_bbox,
@@ -381,8 +382,6 @@ def _partition_pdf_or_image_local(
             is_image=is_image,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
         )

         # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
@@ -411,8 +410,6 @@ def _partition_pdf_or_image_local(
             is_image=is_image,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
         )
         if hasattr(file, "seek"):
             file.seek(0)
@@ -458,6 +455,15 @@ def _partition_pdf_or_image_local(
         **kwargs,
     )

+    if extract_images_in_pdf:
+        extract_images_from_elements(
+            elements=elements,
+            filename=filename,
+            file=file,
+            pdf_image_dpi=pdf_image_dpi,
+            output_dir_path=image_output_dir_path,
+        )
+
     out_elements = []
     for el in elements:
         if isinstance(el, PageBreak) and not include_page_breaks:

unstructured/partition/pdf_image/pdf_image_utils.py (new file, 124 lines)
@@ -0,0 +1,124 @@
+import os
+import tempfile
+from pathlib import PurePath
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+
+import cv2
+import numpy as np
+import pdf2image
+from PIL import Image
+
+from unstructured.documents.elements import ElementType
+from unstructured.logger import logger
+from unstructured.partition.common import convert_to_bytes
+
+if TYPE_CHECKING:
+    from unstructured.documents.elements import Element
+
+
+def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
+    """
+    Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.
+
+    Parameters:
+    - image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image
+      format or a numpy ndarray format.
+    - output_image_path (str): The path to which the image will be written.
+
+    Raises:
+    - ValueError: If the provided image type is neither PIL Image nor numpy ndarray.
+
+    Returns:
+    - None: The function writes the image to the specified path but does not return any value.
+    """
+
+    if isinstance(image, Image.Image):
+        image.save(output_image_path)
+    elif isinstance(image, np.ndarray):
+        cv2.imwrite(output_image_path, image)
+    else:
+        raise ValueError("Unsupported Image Type")
+
+
+def convert_pdf_to_image(
+    filename: str,
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    dpi: int = 200,
+    output_folder: Optional[Union[str, PurePath]] = None,
+    path_only: bool = False,
+) -> Union[List[Image.Image], List[str]]:
+    """Get the image renderings of the pdf pages using pdf2image"""
+
+    if path_only and not output_folder:
+        raise ValueError("output_folder must be specified if path_only is true")
+
+    if file is not None:
+        f_bytes = convert_to_bytes(file)
+        images = pdf2image.convert_from_bytes(
+            f_bytes,
+            dpi=dpi,
+            output_folder=output_folder,
+            paths_only=path_only,
+        )
+    else:
+        images = pdf2image.convert_from_path(
+            filename,
+            dpi=dpi,
+            output_folder=output_folder,
+            paths_only=path_only,
+        )
+
+    return images
+
+
+def extract_images_from_elements(
+    elements: List["Element"],
+    pdf_image_dpi: int,
+    filename: str = "",
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    output_dir_path: Optional[str] = None,
+):
+    """
+    Extract and save images from the page. This method iterates through the layout elements
+    of the page, identifies image regions, and extracts and saves them as separate image files.
+    """
+
+    if not output_dir_path:
+        output_dir_path = os.path.join(os.getcwd(), "figures")
+    os.makedirs(output_dir_path, exist_ok=True)
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        _image_paths = convert_pdf_to_image(
+            filename,
+            file,
+            pdf_image_dpi,
+            output_folder=temp_dir,
+            path_only=True,
+        )
+        image_paths = cast(List[str], _image_paths)
+
+        figure_number = 0
+        for el in elements:
+            coordinates = el.metadata.coordinates
+            if not coordinates or not coordinates.points or el.category != ElementType.IMAGE:
+                continue
+
+            points = coordinates.points
+            x1, y1 = points[0]
+            x2, y2 = points[2]
+            page_number = el.metadata.page_number
+
+            figure_number += 1
+            try:
+                output_f_path = os.path.join(
+                    output_dir_path,
+                    f"figure-{page_number}-{figure_number}.jpg",
+                )
+                image_path = image_paths[page_number - 1]
+                image = Image.open(image_path)
+                cropped_image = image.crop((x1, y1, x2, y2))
+                write_image(cropped_image, output_f_path)
+                # add image path to element metadata
+                el.metadata.image_path = output_f_path
+            except (ValueError, IOError):
+                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
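Aside (an illustrative sketch, not part of the diff): the helpers in `pdf_image_utils.py` can also be used on their own, here with the `example-docs/embedded-images.pdf` sample referenced in the PR summary:

```python
import tempfile

from PIL import Image

from unstructured.partition.pdf_image import pdf_image_utils

with tempfile.TemporaryDirectory() as tmpdir:
    # Render each PDF page to a temporary image file and collect the paths.
    page_image_paths = pdf_image_utils.convert_pdf_to_image(
        filename="example-docs/embedded-images.pdf",
        dpi=200,
        output_folder=tmpdir,
        path_only=True,
    )
    print(f"rendered {len(page_image_paths)} page image(s)")

    # write_image accepts either a PIL image or a numpy array.
    first_page = Image.open(page_image_paths[0])
    pdf_image_utils.write_image(first_page, "first-page.jpg")
```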