Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-11-03 11:34:07 +00:00)
Refactor: support image extraction (#2201)
### Summary

This PR is the second part of the "image extraction" refactor that moves the logic from the `unstructured-inference` repo to the `unstructured` repo; the first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/299. This PR adds the logic needed to extract images from a PDF.

### Testing

`git clone -b refactor/remove_image_extraction_code --single-branch https://github.com/Unstructured-IO/unstructured-inference.git && cd unstructured-inference && pip install -e . && cd ../`

```
elements = partition_pdf(
    filename="example-docs/embedded-images.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
)
print("\n\n".join([str(el) for el in elements]))
```
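Once installed, a minimal sketch of how the extracted figures can be located afterward. This is illustrative only: it assumes `partition_pdf` accepts an `image_output_dir_path` keyword (it is passed through to the helper added in this PR) and that each extracted figure's path is recorded on the element as `metadata.image_path`, as the new `extract_images_from_elements` code does.

```python
from unstructured.partition.pdf import partition_pdf

# Sketch: partition with image extraction enabled, then read back the path
# where each embedded image was saved.
elements = partition_pdf(
    filename="example-docs/embedded-images.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
    image_output_dir_path="figures",  # assumption: defaults to ./figures when omitted
)

for el in elements:
    if el.category == "Image":
        # image_path is populated by extract_images_from_elements in this PR
        print(el.category, el.metadata.image_path)
```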
Parent: c5cb216ac8
Commit: ed76b11b1a
@@ -2,6 +2,7 @@

### Enhancements

+* **Refactor image extraction code.** The image extraction code is moved from `unstructured-inference` to `unstructured`.
* **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.

### Features
@@ -36,7 +36,7 @@ idna==3.6
# requests
imagesize==1.4.1
# via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
# via sphinx
jinja2==3.1.2
# via
@@ -36,7 +36,7 @@ idna==3.6
# requests
imagesize==1.4.1
# via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
# via sphinx
jinja2==3.1.2
# via
@@ -91,7 +91,7 @@ idna==3.6
# anyio
# jsonschema
# requests
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
# via
# build
# jupyter-client
@@ -167,7 +167,7 @@ jupyter-events==0.9.0
# via jupyter-server
jupyter-lsp==2.2.1
# via jupyterlab
-jupyter-server==2.11.1
+jupyter-server==2.11.2
# via
# jupyter-lsp
# jupyterlab
@@ -198,7 +198,7 @@ mistune==3.0.2
# via nbconvert
nbclient==0.9.0
# via nbconvert
-nbconvert==7.11.0
+nbconvert==7.12.0
# via
# jupyter
# jupyter-server
@@ -290,7 +290,7 @@ pyyaml==6.0.1
# -c test.txt
# jupyter-events
# pre-commit
-pyzmq==25.1.1
+pyzmq==25.1.2
# via
# ipykernel
# jupyter-client
@@ -405,7 +405,7 @@ webencodings==0.5.1
# via
# bleach
# tinycss2
-websocket-client==1.6.4
+websocket-client==1.7.0
# via jupyter-server
wheel==0.42.0
# via
@@ -4,7 +4,7 @@
#
# pip-compile --output-file=extra-markdown.txt extra-markdown.in
#
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
# via markdown
markdown==3.5.1
# via -r extra-markdown.in
@@ -45,7 +45,7 @@ flask==3.0.0
# visualdl
flask-babel==4.0.0
# via visualdl
-fonttools==4.45.1
+fonttools==4.46.0
# via matplotlib
future==0.18.3
# via bce-python-sdk
@@ -59,7 +59,7 @@ imageio==2.33.0
# scikit-image
imgaug==0.4.0
# via unstructured-paddleocr
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
# via flask
importlib-resources==6.1.1
# via matplotlib
@@ -8,7 +8,7 @@ pikepdf
pypdf
# Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
@@ -37,7 +37,7 @@ filelock==3.13.1
# transformers
flatbuffers==23.5.26
# via onnxruntime
-fonttools==4.45.1
+fonttools==4.46.0
# via matplotlib
fsspec==2023.9.1
# via
@@ -134,7 +134,7 @@ pdfminer-six==20221105
# pdfplumber
pdfplumber==0.10.3
# via layoutparser
-pikepdf==8.7.1
+pikepdf==8.8.0
# via -r extra-pdf-image.in
pillow==10.0.1
# via
@@ -250,7 +250,7 @@ typing-extensions==4.8.0
# torch
tzdata==2023.3
# via pandas
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
# via -r extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
@@ -60,7 +60,7 @@ idna==3.6
# yarl
isodate==0.6.1
# via azure-storage-blob
-msal==1.25.0
+msal==1.26.0
# via
# azure-datalake-store
# azure-identity
@@ -4,7 +4,7 @@
#
# pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
#
-deltalake==0.13.0
+deltalake==0.14.0
# via -r ingest/delta-table.in
fsspec==2023.9.1
# via
@@ -64,11 +64,11 @@ jsonpatch==1.33
# langchain-core
jsonpointer==2.4
# via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
# via -r ingest/embed-aws-bedrock.in
-langchain-core==0.0.8
+langchain-core==0.0.9
# via langchain
-langsmith==0.0.68
+langsmith==0.0.69
# via
# langchain
# langchain-core
@@ -79,11 +79,11 @@ jsonpatch==1.33
# langchain-core
jsonpointer==2.4
# via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
# via -r ingest/embed-huggingface.in
-langchain-core==0.0.8
+langchain-core==0.0.9
# via langchain
-langsmith==0.0.68
+langsmith==0.0.69
# via
# langchain
# langchain-core
@@ -64,11 +64,11 @@ jsonpatch==1.33
# langchain-core
jsonpointer==2.4
# via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
# via -r ingest/embed-openai.in
-langchain-core==0.0.8
+langchain-core==0.0.9
# via langchain
-langsmith==0.0.68
+langsmith==0.0.69
# via
# langchain
# langchain-core
@@ -125,7 +125,7 @@ tenacity==8.2.3
# via
# langchain
# langchain-core
-tiktoken==0.5.1
+tiktoken==0.5.2
# via -r ingest/embed-openai.in
tqdm==4.66.1
# via
@@ -29,7 +29,7 @@ idna==3.6
# via
# -c ingest/../base.txt
# requests
-msal==1.25.0
+msal==1.26.0
# via
# -r ingest/onedrive.in
# office365-rest-python-client
@@ -23,7 +23,7 @@ idna==3.6
# via
# -c ingest/../base.txt
# requests
-msal==1.25.0
+msal==1.26.0
# via
# -r ingest/outlook.in
# office365-rest-python-client
@@ -33,5 +33,5 @@ urllib3==1.26.18
# -c ingest/../base.txt
# -c ingest/../constraints.in
# requests
-websocket-client==1.6.4
+websocket-client==1.7.0
# via praw
@@ -23,7 +23,7 @@ idna==3.6
# via
# -c ingest/../base.txt
# requests
-msal==1.25.0
+msal==1.26.0
# via
# -r ingest/sharepoint.in
# office365-rest-python-client
@@ -4,5 +4,5 @@
#
# pip-compile --output-file=ingest/slack.txt ingest/slack.in
#
-slack-sdk==3.26.0
+slack-sdk==3.26.1
# via -r ingest/slack.in
@@ -2,44 +2,41 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
-# pip-compile --constraint=requirements/constraints.in requirements/ingest/weaviate.in
+# pip-compile --output-file=ingest/weaviate.txt ingest/weaviate.in
#
authlib==1.2.1
# via weaviate-client
certifi==2023.11.17
# via
-# -c requirements/constraints.in
-# -c requirements/ingest/../base.txt
-# -c requirements/ingest/../constraints.in
+# -c ingest/../base.txt
+# -c ingest/../constraints.in
# requests
cffi==1.16.0
# via cryptography
charset-normalizer==3.3.2
# via
-# -c requirements/ingest/../base.txt
+# -c ingest/../base.txt
# requests
-cryptography==41.0.5
+cryptography==41.0.7
# via authlib
-idna==3.4
+idna==3.6
# via
-# -c requirements/ingest/../base.txt
+# -c ingest/../base.txt
# requests
pycparser==2.21
# via cffi
requests==2.31.0
# via
-# -c requirements/ingest/../base.txt
+# -c ingest/../base.txt
# weaviate-client
urllib3==1.26.18
# via
-# -c requirements/constraints.in
-# -c requirements/ingest/../base.txt
-# -c requirements/ingest/../constraints.in
+# -c ingest/../base.txt
+# -c ingest/../constraints.in
# requests
validators==0.22.0
# via weaviate-client
weaviate-client==3.25.3
# via
-# -c requirements/constraints.in
-# -c requirements/ingest/../constraints.in
-# -r requirements/ingest/weaviate.in
+# -c ingest/../constraints.in
+# -r ingest/weaviate.in
@@ -36,7 +36,7 @@ flake8==6.1.0
# flake8-print
flake8-print==5.0.0
# via -r test.in
-freezegun==1.2.2
+freezegun==1.3.1
# via -r test.in
grpcio==1.59.3
# via -r test.in
@@ -111,7 +111,7 @@ requests==2.31.0
# via
# -c base.txt
# label-studio-sdk
-ruff==0.1.6
+ruff==0.1.7
# via -r test.in
six==1.16.0
# via
test_unstructured/partition/pdf_image/test_pdf_image_utils.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import os
import tempfile

import numpy as np
import pytest
from PIL import Image as PILImg

from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, Image
from unstructured.partition.pdf_image import pdf_image_utils


@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
def test_write_image(image_type):
    mock_pil_image = PILImg.new("RGB", (50, 50))
    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)

    image_map = {
        "pil": mock_pil_image,
        "numpy_array": mock_numpy_image,
    }
    image = image_map[image_type]

    with tempfile.TemporaryDirectory() as tmpdir:
        output_image_path = os.path.join(tmpdir, "test_image.jpg")
        pdf_image_utils.write_image(image, output_image_path)
        assert os.path.exists(output_image_path)

        # Additional check to see if the written image can be read
        read_image = PILImg.open(output_image_path)
        assert read_image is not None


@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(
    file_mode, path_only, filename=example_doc_path("embedded-images.pdf")
):
    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode == "filename":
            images = pdf_image_utils.convert_pdf_to_image(
                filename=filename,
                file=None,
                output_folder=tmpdir,
                path_only=path_only,
            )
        else:
            with open(filename, "rb") as f:
                images = pdf_image_utils.convert_pdf_to_image(
                    filename="",
                    file=f,
                    output_folder=tmpdir,
                    path_only=path_only,
                )

        if path_only:
            assert isinstance(images[0], str)
        else:
            assert isinstance(images[0], PILImg.Image)


def test_extract_images_from_elements(filename=example_doc_path("embedded-images.pdf")):
    with tempfile.TemporaryDirectory() as tmpdir:
        elements = [
            Image(
                text="3",
                coordinates=(
                    (78.7401411111111, 86.61545694444455),
                    (78.7401411111111, 519.9487805555556),
                    (512.0734647222223, 519.9487805555556),
                    (512.0734647222223, 86.61545694444455),
                ),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="4",
                coordinates=(
                    (570.8661397222222, 86.6154566666667),
                    (570.8661397222222, 519.6862825000001),
                    (1003.9369655555556, 519.6862825000001),
                    (1003.9369655555556, 86.6154566666667),
                ),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="5",
                coordinates=(
                    (1062.9921808333331, 86.61545694444455),
                    (1062.9921808333331, 519.9487805555556),
                    (1496.3255044444445, 519.9487805555556),
                    (1496.3255044444445, 86.61545694444455),
                ),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
        ]

        pdf_image_utils.extract_images_from_elements(
            elements=elements, pdf_image_dpi=200, filename=filename, output_dir_path=str(tmpdir)
        )

        for i, el in enumerate(elements):
            expected_image_path = os.path.join(
                str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
            )
            assert os.path.isfile(el.metadata.image_path)
            assert el.metadata.image_path == expected_image_path


def test_write_image_raises_error():
    with pytest.raises(ValueError):
        pdf_image_utils.write_image("invalid_type", "test_image.jpg")
@@ -555,6 +555,44 @@ def _add_regex_metadata(
    return elements


+class ElementType:
+    TITLE = "Title"
+    TEXT = "Text"
+    UNCATEGORIZED_TEXT = "UncategorizedText"
+    NARRATIVE_TEXT = "NarrativeText"
+    BULLETED_TEXT = "BulletedText"
+    ABSTRACT = "Abstract"
+    THREADING = "Threading"
+    FORM = "Form"
+    FIELD_NAME = "Field-Name"
+    VALUE = "Value"
+    LINK = "Link"
+    COMPOSITE_ELEMENT = "CompositeElement"
+    IMAGE = "Image"
+    PICTURE = "Picture"
+    FIGURE_CAPTION = "FigureCaption"
+    FIGURE = "Figure"
+    CAPTION = "Caption"
+    LIST = "List"
+    LIST_ITEM = "ListItem"
+    LIST_ITEM_OTHER = "List-item"
+    CHECKED = "Checked"
+    UNCHECKED = "Unchecked"
+    ADDRESS = "Address"
+    EMAIL_ADDRESS = "EmailAddress"
+    PAGE_BREAK = "PageBreak"
+    FORMULA = "Formula"
+    TABLE = "Table"
+    HEADER = "Header"
+    HEADLINE = "Headline"
+    SUB_HEADLINE = "Subheadline"
+    PAGE_HEADER = "Page-header"  # Title?
+    SECTION_HEADER = "Section-header"
+    FOOTER = "Footer"
+    FOOTNOTE = "Footnote"
+    PAGE_FOOTER = "Page-footer"
+
+
class Element(abc.ABC):
    """An element is a section of a page in the document."""
@@ -764,7 +802,7 @@ class EmailAddress(Text):
class Image(Text):
    """A text element for capturing image metadata."""

-    category = "Image"
+    category = ElementType.IMAGE


class PageBreak(Text):
@@ -797,44 +835,6 @@ class Footer(Text):
    category = "Footer"


-class ElementType:
-    TITLE = "Title"
-    TEXT = "Text"
-    UNCATEGORIZED_TEXT = "UncategorizedText"
-    NARRATIVE_TEXT = "NarrativeText"
-    BULLETED_TEXT = "BulletedText"
-    ABSTRACT = "Abstract"
-    THREADING = "Threading"
-    FORM = "Form"
-    FIELD_NAME = "Field-Name"
-    VALUE = "Value"
-    LINK = "Link"
-    COMPOSITE_ELEMENT = "CompositeElement"
-    IMAGE = "Image"
-    PICTURE = "Picture"
-    FIGURE_CAPTION = "FigureCaption"
-    FIGURE = "Figure"
-    CAPTION = "Caption"
-    LIST = "List"
-    LIST_ITEM = "ListItem"
-    LIST_ITEM_OTHER = "List-item"
-    CHECKED = "Checked"
-    UNCHECKED = "Unchecked"
-    ADDRESS = "Address"
-    EMAIL_ADDRESS = "EmailAddress"
-    PAGE_BREAK = "PageBreak"
-    FORMULA = "Formula"
-    TABLE = "Table"
-    HEADER = "Header"
-    HEADLINE = "Headline"
-    SUB_HEADLINE = "Subheadline"
-    PAGE_HEADER = "Page-header"  # Title?
-    SECTION_HEADER = "Section-header"
-    FOOTER = "Footer"
-    FOOTNOTE = "Footnote"
-    PAGE_FOOTER = "Page-footer"
-
-
TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
    ElementType.TITLE: Title,
    ElementType.SECTION_HEADER: Title,
@@ -70,6 +70,7 @@ from unstructured.partition.lang import (
    check_languages,
    prepare_languages_for_tesseract,
)
+from unstructured.partition.pdf_image.pdf_image_utils import extract_images_from_elements
from unstructured.partition.pdf_image.pdfminer_utils import (
    open_pdfminer_pages_generator,
    rect_to_bbox,
@@ -381,8 +382,6 @@ def _partition_pdf_or_image_local(
            is_image=is_image,
            model_name=model_name,
            pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
        )

    # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
@@ -411,8 +410,6 @@ def _partition_pdf_or_image_local(
            is_image=is_image,
            model_name=model_name,
            pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
        )
        if hasattr(file, "seek"):
            file.seek(0)
@@ -458,6 +455,15 @@ def _partition_pdf_or_image_local(
        **kwargs,
    )

+    if extract_images_in_pdf:
+        extract_images_from_elements(
+            elements=elements,
+            filename=filename,
+            file=file,
+            pdf_image_dpi=pdf_image_dpi,
+            output_dir_path=image_output_dir_path,
+        )
+
    out_elements = []
    for el in elements:
        if isinstance(el, PageBreak) and not include_page_breaks:
unstructured/partition/pdf_image/pdf_image_utils.py (new file, 124 lines)
@@ -0,0 +1,124 @@
import os
import tempfile
from pathlib import PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast

import cv2
import numpy as np
import pdf2image
from PIL import Image

from unstructured.documents.elements import ElementType
from unstructured.logger import logger
from unstructured.partition.common import convert_to_bytes

if TYPE_CHECKING:
    from unstructured.documents.elements import Element


def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
    """
    Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.

    Parameters:
    - image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image
      format or a numpy ndarray format.
    - output_image_path (str): The path to which the image will be written.

    Raises:
    - ValueError: If the provided image type is neither PIL Image nor numpy ndarray.

    Returns:
    - None: The function writes the image to the specified path but does not return any value.
    """

    if isinstance(image, Image.Image):
        image.save(output_image_path)
    elif isinstance(image, np.ndarray):
        cv2.imwrite(output_image_path, image)
    else:
        raise ValueError("Unsupported Image Type")


def convert_pdf_to_image(
    filename: str,
    file: Optional[Union[bytes, BinaryIO]] = None,
    dpi: int = 200,
    output_folder: Optional[Union[str, PurePath]] = None,
    path_only: bool = False,
) -> Union[List[Image.Image], List[str]]:
    """Get the image renderings of the pdf pages using pdf2image"""

    if path_only and not output_folder:
        raise ValueError("output_folder must be specified if path_only is true")

    if file is not None:
        f_bytes = convert_to_bytes(file)
        images = pdf2image.convert_from_bytes(
            f_bytes,
            dpi=dpi,
            output_folder=output_folder,
            paths_only=path_only,
        )
    else:
        images = pdf2image.convert_from_path(
            filename,
            dpi=dpi,
            output_folder=output_folder,
            paths_only=path_only,
        )

    return images


def extract_images_from_elements(
    elements: List["Element"],
    pdf_image_dpi: int,
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO]] = None,
    output_dir_path: Optional[str] = None,
):
    """
    Extract and save images from the page. This method iterates through the layout elements
    of the page, identifies image regions, and extracts and saves them as separate image files.
    """

    if not output_dir_path:
        output_dir_path = os.path.join(os.getcwd(), "figures")
    os.makedirs(output_dir_path, exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        _image_paths = convert_pdf_to_image(
            filename,
            file,
            pdf_image_dpi,
            output_folder=temp_dir,
            path_only=True,
        )
        image_paths = cast(List[str], _image_paths)

        figure_number = 0
        for el in elements:
            coordinates = el.metadata.coordinates
            if not coordinates or not coordinates.points or el.category != ElementType.IMAGE:
                continue

            points = coordinates.points
            x1, y1 = points[0]
            x2, y2 = points[2]
            page_number = el.metadata.page_number

            figure_number += 1
            try:
                output_f_path = os.path.join(
                    output_dir_path,
                    f"figure-{page_number}-{figure_number}.jpg",
                )
                image_path = image_paths[page_number - 1]
                image = Image.open(image_path)
                cropped_image = image.crop((x1, y1, x2, y2))
                write_image(cropped_image, output_f_path)
                # add image path to element metadata
                el.metadata.image_path = output_f_path
            except (ValueError, IOError):
                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)