Feat: return base64 encoded images for PDFs (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
This commit is contained in:
Christine Straub 2023-12-26 21:39:01 -08:00 committed by GitHub
parent 8ba9fadf8a
commit dd144456de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 1220 additions and 69 deletions

View File

@ -6,6 +6,8 @@
* **Update encoders to leverage dataclasses** All encoders now follow a class-based approach and are annotated with the dataclass decorator. Similar to the connectors, each encoder uses a nested dataclass for the configs required to configure a client, as well as a field/property approach to cache the client. This ensures that any variable associated with the class exists as a dataclass field.
### Features
* **Store base64 encoded image data in metadata fields.** Rather than saving extracted images to file, stores the base64 encoded image bytes and the image's MIME type in the metadata fields `image_base64` and `image_mime_type` when the user requests it through a parameter such as `pdf_extract_to_payload`. This allows the API to have parity with the library.
### Fixes

Binary file not shown.

After

Width:  |  Height:  |  Size: 251 KiB

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,6 @@
import os
import pathlib
import tempfile
from unittest import mock
import pytest
@ -7,6 +8,7 @@ from PIL import Image
from pytesseract import TesseractError
from unstructured_inference.inference import layout
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ElementType
@ -632,3 +634,34 @@ def test_partition_image_has_filename(inference_results):
assert element.metadata.filetype == "JPEG"
# This should be kept from the filename we originally gave
assert element.metadata.filename == filename
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_image_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.jpg"),
):
    """Check Image/Table extraction from an image given as a path or as a file object."""
    requested_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # -- options shared by both input modes; only the document source differs --
        partition_kwargs = {
            "strategy": "hi_res",
            "extract_element_types": requested_types,
            "extract_to_payload": extract_to_payload,
            "image_output_dir_path": tmpdir,
        }

        if file_mode == "filename":
            elements = image.partition_image(filename=filename, **partition_kwargs)
        else:
            with open(filename, "rb") as f:
                elements = image.partition_image(file=f, **partition_kwargs)

        # -- must run while `tmpdir` still exists: it checks for files written there --
        assert_element_extraction(elements, requested_types, extract_to_payload, tmpdir)

View File

@ -1,6 +1,8 @@
import base64
import logging
import math
import os
import tempfile
from tempfile import SpooledTemporaryFile
from unittest import mock
@ -15,6 +17,7 @@ from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import (
CoordinatesMetadata,
ElementMetadata,
ElementType,
ListItem,
NarrativeText,
Text,
@ -1123,3 +1126,62 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(filename))
assert expected_log in caplog.text
def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
    """Assert that each element of the requested types carries its extracted image.

    Elements are grouped by category, in the order given by `extract_element_types`, and each
    group is checked independently.

    When `extract_to_payload` is true, every matching element must have `image_base64` set,
    `image_mime_type` equal to "image/jpeg", and no `image_path`. The payload must be strictly
    valid base64.

    Otherwise every matching element must have `image_path` pointing at an existing file named
    `<basename>-<page_number>-<n>.jpg` inside `tmpdir`, where `<basename>` is "table" for Table
    elements and "figure" for anything else, and `<n>` is the element's 1-based position within
    its category group. Both payload fields must be unset.

    Args:
        elements: partitioned elements exposing `.category` and `.metadata`.
        extract_element_types: category names whose images were requested.
        extract_to_payload: whether images were embedded in metadata rather than saved to disk.
        tmpdir: directory the images were written to (used only when not extracting to payload).

    Raises:
        AssertionError: when an element does not satisfy the expectations above.
        binascii.Error: when `image_base64` is not valid base64.
    """
    for el_type in extract_element_types:
        elements_of_type = [el for el in elements if el.category == el_type]

        for i, el in enumerate(elements_of_type):
            if extract_to_payload:
                assert el.metadata.image_base64 is not None
                assert el.metadata.image_mime_type == "image/jpeg"
                # -- `validate=True` makes the decode fail on malformed data; without it,
                # -- characters outside the base64 alphabet are silently discarded and the
                # -- check could never catch a corrupted payload.
                image_data = base64.b64decode(el.metadata.image_base64, validate=True)
                assert isinstance(image_data, bytes)
                assert el.metadata.image_path is None
            else:
                basename = "table" if el.category == ElementType.TABLE else "figure"
                expected_image_path = os.path.join(
                    str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
                )
                assert el.metadata.image_path == expected_image_path
                assert os.path.isfile(expected_image_path)
                assert el.metadata.image_base64 is None
                assert el.metadata.image_mime_type is None
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_pdf_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.pdf"),
):
    """Check Image/Table extraction from a PDF given as a path or as a file object."""
    requested_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # -- options shared by both input modes; only the document source differs --
        partition_kwargs = {
            "strategy": "hi_res",
            "extract_element_types": requested_types,
            "extract_to_payload": extract_to_payload,
            "image_output_dir_path": tmpdir,
        }

        if file_mode == "filename":
            elements = pdf.partition_pdf(filename=filename, **partition_kwargs)
        else:
            with open(filename, "rb") as f:
                elements = pdf.partition_pdf(file=f, **partition_kwargs)

        # -- must run while `tmpdir` still exists: it checks for files written there --
        assert_element_extraction(elements, requested_types, extract_to_payload, tmpdir)

View File

@ -7,7 +7,7 @@ from PIL import Image as PILImg
from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, ElementType, Image
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
from unstructured.partition.pdf_image import pdf_image_utils
@ -60,58 +60,66 @@ def test_convert_pdf_to_image(
assert isinstance(images[0], PILImg.Image)
def test_save_elements(filename=example_doc_path("embedded-images.pdf")):
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_save_elements(
element_category_to_save,
extract_to_payload,
filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
with tempfile.TemporaryDirectory() as tmpdir:
elements = [
Image(
text="3",
coordinates=(
(78.7401411111111, 86.61545694444455),
(78.7401411111111, 519.9487805555556),
(512.0734647222223, 519.9487805555556),
(512.0734647222223, 86.61545694444455),
),
coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1),
),
Image(
text="4",
coordinates=(
(570.8661397222222, 86.6154566666667),
(570.8661397222222, 519.6862825000001),
(1003.9369655555556, 519.6862825000001),
(1003.9369655555556, 86.6154566666667),
),
coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1),
),
Image(
text="5",
coordinates=(
(1062.9921808333331, 86.61545694444455),
(1062.9921808333331, 519.9487805555556),
(1496.3255044444445, 519.9487805555556),
(1496.3255044444445, 86.61545694444455),
),
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1),
),
Table(
text="Sample Table",
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=2),
),
]
pdf_image_utils.save_elements(
elements=elements,
element_category_to_save=ElementType.IMAGE,
element_category_to_save=element_category_to_save,
pdf_image_dpi=200,
filename=filename,
output_dir_path=str(tmpdir),
extract_to_payload=extract_to_payload,
)
for i, el in enumerate(elements):
saved_elements = [el for el in elements if el.category == element_category_to_save]
for i, el in enumerate(saved_elements):
basename = "table" if el.category == ElementType.TABLE else "figure"
expected_image_path = os.path.join(
str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
)
assert os.path.isfile(el.metadata.image_path)
assert el.metadata.image_path == expected_image_path
if extract_to_payload:
assert isinstance(el.metadata.image_base64, str)
assert isinstance(el.metadata.image_mime_type, str)
assert not el.metadata.image_path
assert not os.path.isfile(expected_image_path)
else:
assert os.path.isfile(expected_image_path)
assert el.metadata.image_path == expected_image_path
assert not el.metadata.image_base64
assert not el.metadata.image_mime_type
def test_write_image_raises_error():

View File

@ -3,7 +3,7 @@ import os
import pathlib
import warnings
from importlib import import_module
from unittest.mock import ANY, Mock, patch
from unittest.mock import Mock, patch
import docx
import pytest
@ -347,15 +347,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
mock_partition.assert_called_once_with(
filename=filename,
metadata_filename=None,
file=None,
url=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=ANY,
image_output_dir_path=ANY,
strategy=PartitionStrategy.FAST,
languages=None,
metadata_filename=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=False,
extract_element_types=None,
image_output_dir_path=None,
extract_to_payload=False,
hi_res_model_name=None,
)

View File

@ -173,9 +173,11 @@ class ElementMetadata:
file_directory: Optional[str]
filename: Optional[str]
filetype: Optional[str]
image_path: Optional[str]
image_base64: Optional[str]
image_mime_type: Optional[str]
# -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
header_footer_type: Optional[str]
image_path: Optional[str]
# -- used in chunks only, when chunk must be split mid-text to fit window --
is_continuation: Optional[bool]
languages: Optional[List[str]]
@ -457,6 +459,8 @@ class ConsolidationStrategy(enum.Enum):
"filetype": cls.FIRST,
"header_footer_type": cls.DROP,
"image_path": cls.DROP,
"image_base64": cls.DROP,
"image_mime_type": cls.DROP,
"is_continuation": cls.DROP, # -- not expected, added by chunking, not before --
"languages": cls.LIST_UNIQUE,
"last_modified": cls.FIRST,

View File

@ -137,7 +137,9 @@ def partition(
detect_language_per_element: bool = False,
pdf_infer_table_structure: bool = False,
pdf_extract_images: bool = False,
pdf_extract_element_types: Optional[List[str]] = None,
pdf_image_output_dir_path: Optional[str] = None,
pdf_extract_to_payload: bool = False,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
@ -193,11 +195,26 @@ def partition(
transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False.
pdf_extract_images
If True and strategy=hi_res, any detected images will be saved in the path specified by
pdf_image_output_dir_path.
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by
'pdf_image_output_dir_path' or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'pdf_extract_element_types' for broader extraction capabilities.
pdf_extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'pdf_image_output_dir_path' or stored as base64 encoded
data within metadata fields.
pdf_extract_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'pdf_extract_element_types' will be
encoded as base64 data and stored in two metadata fields: 'image_base64' and
'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
pdf_image_output_dir_path
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
given path
Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'pdf_extract_element_types'.
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml.
@ -397,7 +414,9 @@ def partition(
strategy=strategy,
languages=languages,
extract_images_in_pdf=pdf_extract_images,
extract_element_types=pdf_extract_element_types,
image_output_dir_path=pdf_image_output_dir_path,
extract_to_payload=pdf_extract_to_payload,
hi_res_model_name=hi_res_model_name or model_name,
**kwargs,
)

View File

@ -26,6 +26,10 @@ def partition_image(
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
extract_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses an image into a list of interpreted elements.
@ -58,6 +62,27 @@ def partition_image(
The last modified date for the document.
hi_res_model_name
The layout detection model used when partitioning strategy is set to `hi_res`.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
extract_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
image_output_dir_path
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
"""
exactly_one(filename=filename, file=file)
@ -93,5 +118,9 @@ def partition_image(
strategy=strategy,
metadata_last_modified=metadata_last_modified,
hi_res_model_name=hi_res_model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
extract_to_payload=extract_to_payload,
**kwargs,
)

View File

@ -106,7 +106,6 @@ from unstructured.utils import requires_dependencies
if TYPE_CHECKING:
pass
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore
@ -140,10 +139,11 @@ def partition_pdf(
metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None, # used by decorator
links: Sequence[Link] = [],
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
@ -173,18 +173,29 @@ def partition_pdf(
with Tesseract, you'll first need to install the appropriate Tesseract language pack.
metadata_last_modified
The last modified date for the document.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If `True`, any detected images will be saved in the path specified by
image_output_dir_path.
extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) defined in this list will be saved to `image_output_dir_path`.
image_output_dir_path
Only applicable if `strategy=hi_res`.
The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
hi_res_model_name
The layout detection model used when partitioning strategy is set to `hi_res`.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
extract_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
image_output_dir_path
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
"""
exactly_one(filename=filename, file=file)
@ -199,10 +210,11 @@ def partition_pdf(
infer_table_structure=infer_table_structure,
languages=languages,
metadata_last_modified=metadata_last_modified,
hi_res_model_name=hi_res_model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
hi_res_model_name=hi_res_model_name,
extract_to_payload=extract_to_payload,
**kwargs,
)
@ -249,13 +261,14 @@ def _partition_pdf_or_image_local(
languages: Optional[List[str]] = None,
ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None, # to be deprecated in favor of `hi_res_model_name`
hi_res_model_name: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
metadata_last_modified: Optional[str] = None,
pdf_text_extractable: bool = False,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
hi_res_model_name: Optional[str] = None,
extract_to_payload: bool = False,
analysis: bool = False,
analyzed_image_output_dir_path: Optional[str] = None,
**kwargs,
@ -402,7 +415,9 @@ def _partition_pdf_or_image_local(
element_category_to_save=ElementType.IMAGE,
filename=filename,
file=file,
is_image=is_image,
pdf_image_dpi=pdf_image_dpi,
extract_to_payload=extract_to_payload,
output_dir_path=image_output_dir_path,
)
@ -415,7 +430,9 @@ def _partition_pdf_or_image_local(
element_category_to_save=el_type,
filename=filename,
file=file,
is_image=is_image,
pdf_image_dpi=pdf_image_dpi,
extract_to_payload=extract_to_payload,
output_dir_path=image_output_dir_path,
)
@ -425,10 +442,12 @@ def _partition_pdf_or_image_local(
continue
if isinstance(el, Image):
# NOTE(crag): small chunks of text from Image elements tend to be garbage
if not el.metadata.image_path and (
el.text is None or len(el.text) < 24 or el.text.find(" ") == -1
if (
not extract_images_in_pdf
and ElementType.IMAGE not in extract_element_types
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
):
# NOTE(crag): small chunks of text from Image elements tend to be garbage
continue
else:
out_elements.append(cast(Element, el))
@ -457,10 +476,11 @@ def partition_pdf_or_image(
ocr_languages: Optional[str] = None,
languages: Optional[List[str]] = None,
metadata_last_modified: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
@ -518,11 +538,12 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks,
languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date,
hi_res_model_name=hi_res_model_name,
pdf_text_extractable=pdf_text_extractable,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
hi_res_model_name=hi_res_model_name,
extract_to_payload=extract_to_payload,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)

View File

@ -1,5 +1,7 @@
import base64
import os
import tempfile
from io import BytesIO
from pathlib import PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
@ -79,11 +81,17 @@ def save_elements(
pdf_image_dpi: int,
filename: str = "",
file: Optional[Union[bytes, BinaryIO]] = None,
is_image: bool = False,
extract_to_payload: bool = False,
output_dir_path: Optional[str] = None,
):
"""
Extract and save images from the page. This method iterates through the layout elements
of the page, identifies image regions, and extracts and saves them as separate image files.
Saves specific elements from a PDF or image file as images, either to a directory or embedded
in the element's payload.
This function processes a list of elements partitioned from a PDF or image file. For each
element of a specified category, it extracts and saves the image. The images can either be
saved to a specified directory or embedded into the element's payload as a base64-encoded
string.
"""
if not output_dir_path:
@ -91,14 +99,25 @@ def save_elements(
os.makedirs(output_dir_path, exist_ok=True)
with tempfile.TemporaryDirectory() as temp_dir:
_image_paths = convert_pdf_to_image(
filename,
file,
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
)
image_paths = cast(List[str], _image_paths)
if is_image:
if file is None:
image_paths = [filename]
else:
if hasattr(file, "seek"):
file.seek(0)
temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
temp_file.write(file.read() if hasattr(file, "read") else file)
temp_file.flush()
image_paths = [temp_file.name]
else:
_image_paths = convert_pdf_to_image(
filename,
file,
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
)
image_paths = cast(List[str], _image_paths)
figure_number = 0
for el in elements:
@ -124,9 +143,17 @@ def save_elements(
image_path = image_paths[page_number - 1]
image = Image.open(image_path)
cropped_image = image.crop((x1, y1, x2, y2))
write_image(cropped_image, output_f_path)
# add image path to element metadata
el.metadata.image_path = output_f_path
if extract_to_payload:
buffered = BytesIO()
cropped_image.save(buffered, format="JPEG")
img_base64 = base64.b64encode(buffered.getvalue())
img_base64_str = img_base64.decode()
el.metadata.image_base64 = img_base64_str
el.metadata.image_mime_type = "image/jpeg"
else:
write_image(cropped_image, output_f_path)
# add image path to element metadata
el.metadata.image_path = output_f_path
except (ValueError, IOError):
logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)