Feat: return base64 encoded images for PDFs (#2310)

Closes #2302.

### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64` and `image_mime_type`
- update the "image element filter" logic to keep all image elements in the output if a user specifies image extraction

### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
2025-11-04 03:53:45 +00:00 · 2023-12-26 21:39:01 -08:00 · 2023-12-26 21:39:01 -08:00 · dd144456de
commit dd144456de
parent 8ba9fadf8a
12 changed files with 1220 additions and 69 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,8 @@
 * **Update encoders to leverage dataclasses** All encoders now follow a class approach which get annotated with the dataclass decorator. Similar to the connectors, it uses a nested dataclass for the configs required to configure a client as well as a field/property approach to cache the client. This makes sure any variable associated with the class exists as a dataclass field.
 ### Features
 * **Store base64 encoded image data in metadata fields.** Rather than saving extracted images to file, stores the base64 encoded image bytes and the image MIME type in the metadata fields `image_base64` and `image_mime_type` when the user requests it (e.g. via a parameter such as `pdf_extract_to_payload`). This gives the API parity with the library.
 ### Fixes
--- a/example-docs/embedded-images-tables.jpg
+++ b/example-docs/embedded-images-tables.jpg
--- a/example-docs/embedded-images-tables.pdf
+++ b/example-docs/embedded-images-tables.pdf
--- a/test_unstructured/partition/pdf_image/test_image.py
+++ b/test_unstructured/partition/pdf_image/test_image.py
@ -1,5 +1,6 @@
 import os
 import pathlib
 import tempfile
 from unittest import mock
 import pytest
@ -7,6 +8,7 @@ from PIL import Image
 from pytesseract import TesseractError
 from unstructured_inference.inference import layout
 from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 from unstructured.chunking.title import chunk_by_title
 from unstructured.documents.elements import ElementType
@ -632,3 +634,34 @@ def test_partition_image_has_filename(inference_results):
    assert element.metadata.filetype == "JPEG"
    # This should be kept from the filename we originally gave
    assert element.metadata.filename == filename
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_image_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.jpg"),
):
    """Partition an image with `hi_res` and verify extraction of Image and Table elements.

    Runs once with the document given by path (`file_mode="filename"`) and once with it
    given as an open binary file object, for both payload and on-disk extraction.
    """
    wanted_types = ["Image", "Table"]
    with tempfile.TemporaryDirectory() as tmpdir:
        # -- options shared by both input modes; only the document source differs --
        shared_kwargs = {
            "strategy": "hi_res",
            "extract_element_types": wanted_types,
            "extract_to_payload": extract_to_payload,
            "image_output_dir_path": tmpdir,
        }
        if file_mode != "filename":
            with open(filename, "rb") as f:
                elements = image.partition_image(file=f, **shared_kwargs)
        else:
            elements = image.partition_image(filename=filename, **shared_kwargs)

        # -- checked while `tmpdir` still exists so saved image files can be found --
        assert_element_extraction(elements, wanted_types, extract_to_payload, tmpdir)
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -1,6 +1,8 @@
 import base64
 import logging
 import math
 import os
 import tempfile
 from tempfile import SpooledTemporaryFile
 from unittest import mock
@ -15,6 +17,7 @@ from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import (
    CoordinatesMetadata,
    ElementMetadata,
    ElementType,
    ListItem,
    NarrativeText,
    Text,
@ -1123,3 +1126,62 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
    caplog.set_level(logging.INFO)
    assert pdf.extractable_elements(filename=example_doc_path(filename))
    assert expected_log in caplog.text
 def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
     """Assert that elements of the requested types carry their extracted image data.

     Parameters
     ----------
     elements
         Partitioned elements; each exposes `.category` and `.metadata`.
     extract_element_types
         Category names (e.g. ``["Image", "Table"]``) whose elements are checked.
     extract_to_payload
         When True, each matching element must hold valid, non-empty base64 image data in
         `metadata.image_base64`, the mime type "image/jpeg", and no `image_path`.
         When False, each matching element must reference an existing file named
         ``{table|figure}-{page_number}-{index}.jpg`` under `tmpdir` and hold no payload fields.
     tmpdir
         Directory in which saved image files are expected when `extract_to_payload` is False.

     Raises
     ------
     AssertionError
         When any expectation above is not met.
     binascii.Error
         When `image_base64` is not valid base64 (a ValueError subclass).
     """
     for el_type in extract_element_types:
         # -- the index restarts at 1 for every element type --
         elements_of_type = [el for el in elements if el.category == el_type]
         for index, el in enumerate(elements_of_type, start=1):
             if extract_to_payload:
                 assert el.metadata.image_base64 is not None
                 assert el.metadata.image_mime_type == "image/jpeg"
                 # -- validate=True rejects non-alphabet characters instead of silently
                 # -- discarding them, so a corrupt payload cannot pass unnoticed --
                 image_data = base64.b64decode(el.metadata.image_base64, validate=True)
                 assert image_data, "decoded image payload must not be empty"
                 assert el.metadata.image_path is None
             else:
                 basename = "table" if el.category == ElementType.TABLE else "figure"
                 expected_image_path = os.path.join(
                     str(tmpdir), f"{basename}-{el.metadata.page_number}-{index}.jpg"
                 )
                 assert el.metadata.image_path == expected_image_path
                 assert os.path.isfile(expected_image_path)
                 assert el.metadata.image_base64 is None
                 assert el.metadata.image_mime_type is None
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_pdf_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.pdf"),
):
    """Partition a PDF with `hi_res` and verify extraction of Image and Table elements.

    Runs once with the document given by path (`file_mode="filename"`) and once with it
    given as an open binary file object, for both payload and on-disk extraction.
    """
    wanted_types = ["Image", "Table"]
    with tempfile.TemporaryDirectory() as tmpdir:
        # -- options shared by both input modes; only the document source differs --
        shared_kwargs = {
            "strategy": "hi_res",
            "extract_element_types": wanted_types,
            "extract_to_payload": extract_to_payload,
            "image_output_dir_path": tmpdir,
        }
        if file_mode != "filename":
            with open(filename, "rb") as f:
                elements = pdf.partition_pdf(file=f, **shared_kwargs)
        else:
            elements = pdf.partition_pdf(filename=filename, **shared_kwargs)

        # -- checked while `tmpdir` still exists so saved image files can be found --
        assert_element_extraction(elements, wanted_types, extract_to_payload, tmpdir)
--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@ -7,7 +7,7 @@ from PIL import Image as PILImg
 from test_unstructured.unit_utils import example_doc_path
 from unstructured.documents.coordinates import PixelSpace
-from unstructured.documents.elements import ElementMetadata, ElementType, Image
+from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
 from unstructured.partition.pdf_image import pdf_image_utils
@ -60,58 +60,66 @@ def test_convert_pdf_to_image(
            assert isinstance(images[0], PILImg.Image)
-def test_save_elements(filename=example_doc_path("embedded-images.pdf")):
+@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_to_payload", [False, True])
 def test_save_elements(
    element_category_to_save,
    extract_to_payload,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
 ):
    with tempfile.TemporaryDirectory() as tmpdir:
        elements = [
            Image(
                text="3",
-                coordinates=(
+                coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
                    (78.7401411111111, 86.61545694444455),
                    (78.7401411111111, 519.9487805555556),
                    (512.0734647222223, 519.9487805555556),
                    (512.0734647222223, 86.61545694444455),
                ),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="4",
-                coordinates=(
+                coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
                    (570.8661397222222, 86.6154566666667),
                    (570.8661397222222, 519.6862825000001),
                    (1003.9369655555556, 519.6862825000001),
                    (1003.9369655555556, 86.6154566666667),
                ),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="5",
-                coordinates=(
+                coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
                    (1062.9921808333331, 86.61545694444455),
                    (1062.9921808333331, 519.9487805555556),
                    (1496.3255044444445, 519.9487805555556),
                    (1496.3255044444445, 86.61545694444455),
                ),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Table(
                text="Sample Table",
                coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=2),
            ),
        ]
        pdf_image_utils.save_elements(
            elements=elements,
-            element_category_to_save=ElementType.IMAGE,
+            element_category_to_save=element_category_to_save,
            pdf_image_dpi=200,
            filename=filename,
            output_dir_path=str(tmpdir),
            extract_to_payload=extract_to_payload,
        )
-        for i, el in enumerate(elements):
+        saved_elements = [el for el in elements if el.category == element_category_to_save]
        for i, el in enumerate(saved_elements):
            basename = "table" if el.category == ElementType.TABLE else "figure"
            expected_image_path = os.path.join(
-                str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
+                str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
            )
-            assert os.path.isfile(el.metadata.image_path)
+            if extract_to_payload:
-            assert el.metadata.image_path == expected_image_path
+                assert isinstance(el.metadata.image_base64, str)
                assert isinstance(el.metadata.image_mime_type, str)
                assert not el.metadata.image_path
                assert not os.path.isfile(expected_image_path)
            else:
                assert os.path.isfile(expected_image_path)
                assert el.metadata.image_path == expected_image_path
                assert not el.metadata.image_base64
                assert not el.metadata.image_mime_type
 def test_write_image_raises_error():
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -3,7 +3,7 @@ import os
 import pathlib
 import warnings
 from importlib import import_module
-from unittest.mock import ANY, Mock, patch
+from unittest.mock import Mock, patch
 import docx
 import pytest
@ -347,15 +347,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
    mock_partition.assert_called_once_with(
        filename=filename,
        metadata_filename=None,
        file=None,
        url=None,
        include_page_breaks=False,
        infer_table_structure=False,
        extract_images_in_pdf=ANY,
        image_output_dir_path=ANY,
        strategy=PartitionStrategy.FAST,
        languages=None,
        metadata_filename=None,
        include_page_breaks=False,
        infer_table_structure=False,
        extract_images_in_pdf=False,
        extract_element_types=None,
        image_output_dir_path=None,
        extract_to_payload=False,
        hi_res_model_name=None,
    )
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@ -173,9 +173,11 @@ class ElementMetadata:
    file_directory: Optional[str]
    filename: Optional[str]
    filetype: Optional[str]
    image_path: Optional[str]
    image_base64: Optional[str]
    image_mime_type: Optional[str]
    # -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
    header_footer_type: Optional[str]
    image_path: Optional[str]
    # -- used in chunks only, when chunk must be split mid-text to fit window --
    is_continuation: Optional[bool]
    languages: Optional[List[str]]
@ -457,6 +459,8 @@ class ConsolidationStrategy(enum.Enum):
            "filetype": cls.FIRST,
            "header_footer_type": cls.DROP,
            "image_path": cls.DROP,
            "image_base64": cls.DROP,
            "image_mime_type": cls.DROP,
            "is_continuation": cls.DROP,  # -- not expected, added by chunking, not before --
            "languages": cls.LIST_UNIQUE,
            "last_modified": cls.FIRST,
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -137,7 +137,9 @@ def partition(
    detect_language_per_element: bool = False,
    pdf_infer_table_structure: bool = False,
    pdf_extract_images: bool = False,
    pdf_extract_element_types: Optional[List[str]] = None,
    pdf_image_output_dir_path: Optional[str] = None,
    pdf_extract_to_payload: bool = False,
    xml_keep_tags: bool = False,
    data_source_metadata: Optional[DataSourceMetadata] = None,
    metadata_filename: Optional[str] = None,
@ -193,11 +195,26 @@ def partition(
        transformation of the data into an HTML <table>.
        The "text" field for a partitioned Table Element is always present, whether True or False.
    pdf_extract_images
-        If True and strategy=hi_res, any detected images will be saved in the path specified by
+        Only applicable if `strategy=hi_res`.
-        pdf_image_output_dir_path.
+        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
        or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_element_types' for broader extraction capabilities.
    pdf_extract_element_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
        within metadata fields.
    pdf_extract_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    pdf_image_output_dir_path
-        If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
+        Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
-        given path
+        The filesystem path for saving images of the element type(s)
        specified in 'extract_element_types'.
    xml_keep_tags
        If True, will retain the XML tags in the output. Otherwise it will simply extract
        the text from within the tags. Only applies to partition_xml.
@ -397,7 +414,9 @@ def partition(
            strategy=strategy,
            languages=languages,
            extract_images_in_pdf=pdf_extract_images,
            extract_element_types=pdf_extract_element_types,
            image_output_dir_path=pdf_image_output_dir_path,
            extract_to_payload=pdf_extract_to_payload,
            hi_res_model_name=hi_res_model_name or model_name,
            **kwargs,
        )
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -26,6 +26,10 @@ def partition_image(
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
    extract_to_payload: bool = False,
    **kwargs,
 ) -> List[Element]:
    """Parses an image into a list of interpreted elements.
@ -58,6 +62,27 @@ def partition_image(
        The last modified date for the document.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
        or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_element_types' for broader extraction capabilities.
    extract_element_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
        within metadata fields.
    extract_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    image_output_dir_path
        Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_element_types'.
    """
    exactly_one(filename=filename, file=file)
@ -93,5 +118,9 @@ def partition_image(
        strategy=strategy,
        metadata_last_modified=metadata_last_modified,
        hi_res_model_name=hi_res_model_name,
        extract_images_in_pdf=extract_images_in_pdf,
        extract_element_types=extract_element_types,
        image_output_dir_path=image_output_dir_path,
        extract_to_payload=extract_to_payload,
        **kwargs,
    )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -106,7 +106,6 @@ from unstructured.utils import requires_dependencies
 if TYPE_CHECKING:
    pass
 # NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
 # the bug: https://github.com/pdfminer/pdfminer.six/pull/885
 psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
@ -140,10 +139,11 @@ def partition_pdf(
    metadata_last_modified: Optional[str] = None,
    chunking_strategy: Optional[str] = None,  # used by decorator
    links: Sequence[Link] = [],
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
-    hi_res_model_name: Optional[str] = None,
+    extract_to_payload: bool = False,
    **kwargs,
 ) -> List[Element]:
    """Parses a pdf document into a list of interpreted elements.
@ -173,18 +173,29 @@ def partition_pdf(
        with Tesseract, you'll first need to install the appropriate Tesseract language pack.
    metadata_last_modified
        The last modified date for the document.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If `True`, any detected images will be saved in the path specified by
        image_output_dir_path.
    extract_element_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) defined in this list will be saved to `image_output_dir_path`.
    image_output_dir_path
        Only applicable if `strategy=hi_res`.
        The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
        or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_element_types' for broader extraction capabilities.
    extract_element_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
        within metadata fields.
    extract_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    image_output_dir_path
        Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_element_types'.
    """
    exactly_one(filename=filename, file=file)
@ -199,10 +210,11 @@ def partition_pdf(
        infer_table_structure=infer_table_structure,
        languages=languages,
        metadata_last_modified=metadata_last_modified,
        hi_res_model_name=hi_res_model_name,
        extract_images_in_pdf=extract_images_in_pdf,
        extract_element_types=extract_element_types,
        image_output_dir_path=image_output_dir_path,
-        hi_res_model_name=hi_res_model_name,
+        extract_to_payload=extract_to_payload,
        **kwargs,
    )
@ -249,13 +261,14 @@ def _partition_pdf_or_image_local(
    languages: Optional[List[str]] = None,
    ocr_mode: str = OCRMode.FULL_PAGE.value,
    model_name: Optional[str] = None,  # to be deprecated in favor of `hi_res_model_name`
    hi_res_model_name: Optional[str] = None,
    pdf_image_dpi: Optional[int] = None,
    metadata_last_modified: Optional[str] = None,
    pdf_text_extractable: bool = False,
    extract_images_in_pdf: bool = False,
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
-    pdf_image_dpi: Optional[int] = None,
+    extract_to_payload: bool = False,
    hi_res_model_name: Optional[str] = None,
    analysis: bool = False,
    analyzed_image_output_dir_path: Optional[str] = None,
    **kwargs,
@ -402,7 +415,9 @@ def _partition_pdf_or_image_local(
            element_category_to_save=ElementType.IMAGE,
            filename=filename,
            file=file,
            is_image=is_image,
            pdf_image_dpi=pdf_image_dpi,
            extract_to_payload=extract_to_payload,
            output_dir_path=image_output_dir_path,
        )
@ -415,7 +430,9 @@ def _partition_pdf_or_image_local(
            element_category_to_save=el_type,
            filename=filename,
            file=file,
            is_image=is_image,
            pdf_image_dpi=pdf_image_dpi,
            extract_to_payload=extract_to_payload,
            output_dir_path=image_output_dir_path,
        )
@ -425,10 +442,12 @@ def _partition_pdf_or_image_local(
            continue
        if isinstance(el, Image):
-            # NOTE(crag): small chunks of text from Image elements tend to be garbage
+            if (
-            if not el.metadata.image_path and (
+                not extract_images_in_pdf
-                el.text is None or len(el.text) < 24 or el.text.find(" ") == -1
+                and ElementType.IMAGE not in extract_element_types
                and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
            ):
                # NOTE(crag): small chunks of text from Image elements tend to be garbage
                continue
            else:
                out_elements.append(cast(Element, el))
@ -457,10 +476,11 @@ def partition_pdf_or_image(
    ocr_languages: Optional[str] = None,
    languages: Optional[List[str]] = None,
    metadata_last_modified: Optional[str] = None,
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
    extract_element_types: Optional[List[str]] = None,
    image_output_dir_path: Optional[str] = None,
-    hi_res_model_name: Optional[str] = None,
+    extract_to_payload: bool = False,
    **kwargs,
 ) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
@ -518,11 +538,12 @@ def partition_pdf_or_image(
                include_page_breaks=include_page_breaks,
                languages=languages,
                metadata_last_modified=metadata_last_modified or last_modification_date,
                hi_res_model_name=hi_res_model_name,
                pdf_text_extractable=pdf_text_extractable,
                extract_images_in_pdf=extract_images_in_pdf,
                extract_element_types=extract_element_types,
                image_output_dir_path=image_output_dir_path,
-                hi_res_model_name=hi_res_model_name,
+                extract_to_payload=extract_to_payload,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@ -1,5 +1,7 @@
 import base64
 import os
 import tempfile
 from io import BytesIO
 from pathlib import PurePath
 from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
@ -79,11 +81,17 @@ def save_elements(
    pdf_image_dpi: int,
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO]] = None,
    is_image: bool = False,
    extract_to_payload: bool = False,
    output_dir_path: Optional[str] = None,
 ):
    """
-    Extract and save images from the page. This method iterates through the layout elements
+    Saves specific elements from a PDF as images either to a directory or embeds them in the
-    of the page, identifies image regions, and extracts and saves them as separate image files.
+    element's payload.
    This function processes a list of elements partitioned from a PDF file. For each element of
    a specified category, it extracts and saves the image. The images can either be saved to
    a specified directory or embedded into the element's payload as a base64-encoded string.
    """
    if not output_dir_path:
@ -91,14 +99,25 @@ def save_elements(
    os.makedirs(output_dir_path, exist_ok=True)
    with tempfile.TemporaryDirectory() as temp_dir:
-        _image_paths = convert_pdf_to_image(
+        if is_image:
-            filename,
+            if file is None:
-            file,
+                image_paths = [filename]
-            pdf_image_dpi,
+            else:
-            output_folder=temp_dir,
+                if hasattr(file, "seek"):
-            path_only=True,
+                    file.seek(0)
-        )
+                temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
-        image_paths = cast(List[str], _image_paths)
+                temp_file.write(file.read() if hasattr(file, "read") else file)
                temp_file.flush()
                image_paths = [temp_file.name]
        else:
            _image_paths = convert_pdf_to_image(
                filename,
                file,
                pdf_image_dpi,
                output_folder=temp_dir,
                path_only=True,
            )
            image_paths = cast(List[str], _image_paths)
        figure_number = 0
        for el in elements:
@ -124,9 +143,17 @@ def save_elements(
                image_path = image_paths[page_number - 1]
                image = Image.open(image_path)
                cropped_image = image.crop((x1, y1, x2, y2))
-                write_image(cropped_image, output_f_path)
+                if extract_to_payload:
-                # add image path to element metadata
+                    buffered = BytesIO()
-                el.metadata.image_path = output_f_path
+                    cropped_image.save(buffered, format="JPEG")
                    img_base64 = base64.b64encode(buffered.getvalue())
                    img_base64_str = img_base64.decode()
                    el.metadata.image_base64 = img_base64_str
                    el.metadata.image_mime_type = "image/jpeg"
                else:
                    write_image(cropped_image, output_f_path)
                    # add image path to element metadata
                    el.metadata.image_path = output_f_path
            except (ValueError, IOError):
                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)