Refactor: rename image extraction kwargs (#2303)

Currently, we're using different kwarg names in `partition()` and `partition_pdf()`, which has implications for the API, since the API goes through `partition()`.

### Summary

- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` -> `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in `partition.auto`
- add unit tests to test element extraction for `pdf/image` via `partition.auto`

### Testing

CI should pass.
2025-12-24 13:44:05 +00:00 · 2024-01-04 09:52:00 -08:00 · 2024-01-04 09:52:00 -08:00 · 5b0ae3fd8b
commit 5b0ae3fd8b
parent 8e2bfcab18
12 changed files with 185 additions and 120 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,13 @@
+## 0.11.9-dev0
+
+### Enhancements
+
+* **Rename kwargs related to extracting image blocks.** Rename the kwargs related to extracting image blocks for consistency and API usage.
+
+### Features
+
+### Fixes
+
 ## 0.11.8

 ### Enhancements
--- a/test_unstructured/partition/pdf_image/test_image.py
+++ b/test_unstructured/partition/pdf_image/test_image.py
@ -637,29 +637,31 @@ def test_partition_image_has_filename(inference_results):


@pytest.mark.parametrize("file_mode", ["filename", "rb"])
-@pytest.mark.parametrize("extract_to_payload", [False, True])
+@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
 def test_partition_image_element_extraction(
    file_mode,
-    extract_to_payload,
+    extract_image_block_to_payload,
    filename=example_doc_path("embedded-images-tables.jpg"),
 ):
-    extract_element_types = ["Image", "Table"]
+    extract_image_block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode == "filename":
            elements = image.partition_image(
                filename=filename,
-                extract_element_types=extract_element_types,
-                extract_to_payload=extract_to_payload,
-                image_output_dir_path=tmpdir,
+                extract_image_block_types=extract_image_block_types,
+                extract_image_block_to_payload=extract_image_block_to_payload,
+                extract_image_block_output_dir=tmpdir,
            )
        else:
            with open(filename, "rb") as f:
                elements = image.partition_image(
                    file=f,
-                    extract_element_types=extract_element_types,
-                    extract_to_payload=extract_to_payload,
-                    image_output_dir_path=tmpdir,
+                    extract_image_block_types=extract_image_block_types,
+                    extract_image_block_to_payload=extract_image_block_to_payload,
+                    extract_image_block_output_dir=tmpdir,
                )

-        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
+        assert_element_extraction(
+            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
+        )
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@ -1128,9 +1128,11 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
    assert expected_log in caplog.text


-def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
+def assert_element_extraction(
+    elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
+):
    extracted_elements = []
-    for el_type in extract_element_types:
+    for el_type in extract_image_block_types:
        extracted_elements_by_type = []
        for el in elements:
            if el.category == el_type:
@ -1139,7 +1141,7 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa

    for extracted_elements_by_type in extracted_elements:
        for i, el in enumerate(extracted_elements_by_type):
-            if extract_to_payload:
+            if extract_image_block_to_payload:
                assert el.metadata.image_base64 is not None
                assert el.metadata.image_mime_type == "image/jpeg"
                image_data = base64.b64decode(el.metadata.image_base64)
@ -1157,29 +1159,31 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa


@pytest.mark.parametrize("file_mode", ["filename", "rb"])
-@pytest.mark.parametrize("extract_to_payload", [False, True])
+@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
 def test_partition_pdf_element_extraction(
    file_mode,
-    extract_to_payload,
+    extract_image_block_to_payload,
    filename=example_doc_path("embedded-images-tables.pdf"),
 ):
-    extract_element_types = ["Image", "Table"]
+    extract_image_block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode == "filename":
            elements = pdf.partition_pdf(
                filename=filename,
-                extract_element_types=extract_element_types,
-                extract_to_payload=extract_to_payload,
-                image_output_dir_path=tmpdir,
+                extract_image_block_types=extract_image_block_types,
+                extract_image_block_to_payload=extract_image_block_to_payload,
+                extract_image_block_output_dir=tmpdir,
            )
        else:
            with open(filename, "rb") as f:
                elements = pdf.partition_pdf(
                    file=f,
-                    extract_element_types=extract_element_types,
-                    extract_to_payload=extract_to_payload,
-                    image_output_dir_path=tmpdir,
+                    extract_image_block_types=extract_image_block_types,
+                    extract_image_block_to_payload=extract_image_block_to_payload,
+                    extract_image_block_output_dir=tmpdir,
                )

-        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
+        assert_element_extraction(
+            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
+        )
--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@ -61,10 +61,10 @@ def test_convert_pdf_to_image(


@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
-@pytest.mark.parametrize("extract_to_payload", [False, True])
+@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
 def test_save_elements(
    element_category_to_save,
-    extract_to_payload,
+    extract_image_block_to_payload,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
 ):
    with tempfile.TemporaryDirectory() as tmpdir:
@ -101,7 +101,7 @@ def test_save_elements(
            pdf_image_dpi=200,
            filename=filename,
            output_dir_path=str(tmpdir),
-            extract_to_payload=extract_to_payload,
+            extract_image_block_to_payload=extract_image_block_to_payload,
        )

        saved_elements = [el for el in elements if el.category == element_category_to_save]
@ -110,7 +110,7 @@ def test_save_elements(
            expected_image_path = os.path.join(
                str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
            )
-            if extract_to_payload:
+            if extract_image_block_to_payload:
                assert isinstance(el.metadata.image_base64, str)
                assert isinstance(el.metadata.image_mime_type, str)
                assert not el.metadata.image_path
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -1,6 +1,7 @@
 import json
 import os
 import pathlib
+import tempfile
 import warnings
 from importlib import import_module
 from unittest.mock import Mock, patch
@ -8,6 +9,7 @@ from unittest.mock import Mock, patch
 import docx
 import pytest

+from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
 from test_unstructured.partition.test_constants import (
    EXPECTED_TABLE,
    EXPECTED_TABLE_XLSX,
@ -355,9 +357,9 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
        include_page_breaks=False,
        infer_table_structure=False,
        extract_images_in_pdf=False,
-        extract_element_types=None,
-        image_output_dir_path=None,
-        extract_to_payload=False,
+        extract_image_block_types=None,
+        extract_image_block_output_dir=None,
+        extract_image_block_to_payload=False,
        hi_res_model_name=None,
    )

@ -460,6 +462,26 @@ def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, co
    assert elements[idx].metadata.coordinates is not None


+@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
+def test_auto_partition_image_element_extraction(
+    extract_image_block_to_payload,
+    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"),
+):
+    extract_image_block_types = ["Image", "Table"]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        elements = partition(
+            filename=filename,
+            extract_image_block_types=extract_image_block_types,
+            extract_image_block_to_payload=extract_image_block_to_payload,
+            extract_image_block_output_dir=tmpdir,
+        )
+
+        assert_element_extraction(
+            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
+        )
+
+
@pytest.mark.parametrize(
    ("pass_metadata_filename", "content_type"),
    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
@ -666,6 +688,26 @@ def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypat
    assert all(el.metadata.filetype == expected for el in elements)


+@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
+def test_auto_partition_pdf_element_extraction(
+    extract_image_block_to_payload,
+    filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"),
+):
+    extract_image_block_types = ["Image", "Table"]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        elements = partition(
+            filename=filename,
+            extract_image_block_types=extract_image_block_types,
+            extract_image_block_to_payload=extract_image_block_to_payload,
+            extract_image_block_output_dir=tmpdir,
+        )
+
+        assert_element_extraction(
+            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
+        )
+
+
 supported_filetypes = [
    _
    for _ in FileType
--- a/test_unstructured/partition/test_strategies.py
+++ b/test_unstructured/partition/test_strategies.py
@ -76,7 +76,7 @@ def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_
        "pdf_text_extractable",
        "infer_table_structure",
        "extract_images_in_pdf",
-        "extract_element_types",
+        "extract_image_block_types",
        "expected",
    ),
    [
@ -102,7 +102,7 @@ def test_determine_pdf_auto_strategy(
    pdf_text_extractable,
    infer_table_structure,
    extract_images_in_pdf,
-    extract_element_types,
+    extract_image_block_types,
    expected,
 ):
    strategy = strategies.determine_pdf_or_image_strategy(
@ -111,7 +111,7 @@ def test_determine_pdf_auto_strategy(
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
        extract_images_in_pdf=extract_images_in_pdf,
-        extract_element_types=extract_element_types,
+        extract_image_block_types=extract_image_block_types,
    )
    assert strategy == expected

--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.11.8"  # pragma: no cover
+__version__ = "0.11.9-dev0"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -136,10 +136,10 @@ def partition(
    languages: Optional[List[str]] = None,
    detect_language_per_element: bool = False,
    pdf_infer_table_structure: bool = False,
-    pdf_extract_images: bool = False,
-    pdf_extract_element_types: Optional[List[str]] = None,
-    pdf_image_output_dir_path: Optional[str] = None,
-    pdf_extract_to_payload: bool = False,
+    extract_images_in_pdf: bool = False,
+    extract_image_block_types: Optional[List[str]] = None,
+    extract_image_block_output_dir: Optional[str] = None,
+    extract_image_block_to_payload: bool = False,
    xml_keep_tags: bool = False,
    data_source_metadata: Optional[DataSourceMetadata] = None,
    metadata_filename: Optional[str] = None,
@ -194,27 +194,28 @@ def partition(
        additional metadata field, "text_as_html," where the value (string) is a just a
        transformation of the data into an HTML <table>.
        The "text" field for a partitioned Table Element is always present, whether True or False.
-    pdf_extract_images
+    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
-        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
-        or stored as base64 encoded data within metadata fields.
+        If True, any detected images will be saved in the path specified by
+        'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
-        'extract_element_types' for broader extraction capabilities.
-    pdf_extract_element_types
+        'extract_image_block_types' for broader extraction capabilities.
+    extract_image_block_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
-        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
-        within metadata fields.
-    pdf_extract_to_payload
+        saved in the path specified by 'extract_image_block_output_dir' or stored as base64
+        encoded data within metadata fields.
+    extract_image_block_to_payload
        Only applicable if `strategy=hi_res`.
-        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
-        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
+        If True, images of the element type(s) defined in 'extract_image_block_types' will be
+        encoded as base64 data and stored in two metadata fields: 'image_base64' and
+        'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
-    pdf_image_output_dir_path
-        Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
+    extract_image_block_output_dir
+        Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
        The filesystem path for saving images of the element type(s)
-        specified in 'extract_element_types'.
+        specified in 'extract_image_block_types'.
    xml_keep_tags
        If True, will retain the XML tags in the output. Otherwise it will simply extract
        the text from within the tags. Only applies to partition_xml.
@ -413,11 +414,11 @@ def partition(
            infer_table_structure=infer_table_structure,
            strategy=strategy,
            languages=languages,
-            extract_images_in_pdf=pdf_extract_images,
-            extract_element_types=pdf_extract_element_types,
-            image_output_dir_path=pdf_image_output_dir_path,
-            extract_to_payload=pdf_extract_to_payload,
            hi_res_model_name=hi_res_model_name or model_name,
+            extract_images_in_pdf=extract_images_in_pdf,
+            extract_image_block_types=extract_image_block_types,
+            extract_image_block_output_dir=extract_image_block_output_dir,
+            extract_image_block_to_payload=extract_image_block_to_payload,
            **kwargs,
        )
    elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
@ -430,6 +431,10 @@ def partition(
            strategy=strategy,
            languages=languages,
            hi_res_model_name=hi_res_model_name or model_name,
+            extract_images_in_pdf=extract_images_in_pdf,
+            extract_image_block_types=extract_image_block_types,
+            extract_image_block_output_dir=extract_image_block_output_dir,
+            extract_image_block_to_payload=extract_image_block_to_payload,
            **kwargs,
        )
    elif filetype == FileType.TXT:
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@ -27,9 +27,9 @@ def partition_image(
    chunking_strategy: Optional[str] = None,
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
-    extract_element_types: Optional[List[str]] = None,
-    image_output_dir_path: Optional[str] = None,
-    extract_to_payload: bool = False,
+    extract_image_block_types: Optional[List[str]] = None,
+    extract_image_block_output_dir: Optional[str] = None,
+    extract_image_block_to_payload: bool = False,
    **kwargs,
 ) -> List[Element]:
    """Parses an image into a list of interpreted elements.
@ -64,25 +64,26 @@ def partition_image(
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
-        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
-        or stored as base64 encoded data within metadata fields.
+        If True, any detected images will be saved in the path specified by
+        'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
-        'extract_element_types' for broader extraction capabilities.
-    extract_element_types
+        'extract_image_block_types' for broader extraction capabilities.
+    extract_image_block_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
-        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
-        within metadata fields.
-    extract_to_payload
+        saved in the path specified by 'extract_image_block_output_dir' or stored as base64 encoded
+        data within metadata fields.
+    extract_image_block_to_payload
        Only applicable if `strategy=hi_res`.
-        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
-        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
+        If True, images of the element type(s) defined in 'extract_image_block_types' will be
+        encoded as base64 data and stored in two metadata fields: 'image_base64' and
+        'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
-    image_output_dir_path
-        Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
+    extract_image_block_output_dir
+        Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
        The filesystem path for saving images of the element type(s)
-        specified in 'extract_element_types'.
+        specified in 'extract_image_block_types'.
    """
    exactly_one(filename=filename, file=file)

@ -119,8 +120,8 @@ def partition_image(
        metadata_last_modified=metadata_last_modified,
        hi_res_model_name=hi_res_model_name,
        extract_images_in_pdf=extract_images_in_pdf,
-        extract_element_types=extract_element_types,
-        image_output_dir_path=image_output_dir_path,
-        extract_to_payload=extract_to_payload,
+        extract_image_block_types=extract_image_block_types,
+        extract_image_block_output_dir=extract_image_block_output_dir,
+        extract_image_block_to_payload=extract_image_block_to_payload,
        **kwargs,
    )
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -141,9 +141,9 @@ def partition_pdf(
    links: Sequence[Link] = [],
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
-    extract_element_types: Optional[List[str]] = None,
-    image_output_dir_path: Optional[str] = None,
-    extract_to_payload: bool = False,
+    extract_image_block_types: Optional[List[str]] = None,
+    extract_image_block_output_dir: Optional[str] = None,
+    extract_image_block_to_payload: bool = False,
    **kwargs,
 ) -> List[Element]:
    """Parses a pdf document into a list of interpreted elements.
@ -177,25 +177,26 @@ def partition_pdf(
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
-        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
-        or stored as base64 encoded data within metadata fields.
+        If True, any detected images will be saved in the path specified by
+        'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
-        'extract_element_types' for broader extraction capabilities.
-    extract_element_types
+        'extract_image_block_types' for broader extraction capabilities.
+    extract_image_block_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
-        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
-        within metadata fields.
-    extract_to_payload
+        saved in the path specified by 'extract_image_block_output_dir' or stored as base64
+        encoded data within metadata fields.
+    extract_image_block_to_payload
        Only applicable if `strategy=hi_res`.
-        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
-        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
+        If True, images of the element type(s) defined in 'extract_image_block_types' will be
+        encoded as base64 data and stored in two metadata fields: 'image_base64' and
+        'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
-    image_output_dir_path
-        Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
+    extract_image_block_output_dir
+        Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
        The filesystem path for saving images of the element type(s)
-        specified in 'extract_element_types'.
+        specified in 'extract_image_block_types'.
    """

    exactly_one(filename=filename, file=file)
@ -212,9 +213,9 @@ def partition_pdf(
        metadata_last_modified=metadata_last_modified,
        hi_res_model_name=hi_res_model_name,
        extract_images_in_pdf=extract_images_in_pdf,
-        extract_element_types=extract_element_types,
-        image_output_dir_path=image_output_dir_path,
-        extract_to_payload=extract_to_payload,
+        extract_image_block_types=extract_image_block_types,
+        extract_image_block_output_dir=extract_image_block_output_dir,
+        extract_image_block_to_payload=extract_image_block_to_payload,
        **kwargs,
    )

@ -266,9 +267,9 @@ def _partition_pdf_or_image_local(
    metadata_last_modified: Optional[str] = None,
    pdf_text_extractable: bool = False,
    extract_images_in_pdf: bool = False,
-    extract_element_types: Optional[List[str]] = None,
-    image_output_dir_path: Optional[str] = None,
-    extract_to_payload: bool = False,
+    extract_image_block_types: Optional[List[str]] = None,
+    extract_image_block_output_dir: Optional[str] = None,
+    extract_image_block_to_payload: bool = False,
    analysis: bool = False,
    analyzed_image_output_dir_path: Optional[str] = None,
    **kwargs,
@ -406,7 +407,7 @@ def _partition_pdf_or_image_local(
        **kwargs,
    )

-    extract_element_types = check_element_types_to_extract(extract_element_types)
+    extract_image_block_types = check_element_types_to_extract(extract_image_block_types)
    #  NOTE(christine): `extract_images_in_pdf` would deprecate
    #  (but continue to support for a while)
    if extract_images_in_pdf:
@ -417,11 +418,11 @@ def _partition_pdf_or_image_local(
            file=file,
            is_image=is_image,
            pdf_image_dpi=pdf_image_dpi,
-            extract_to_payload=extract_to_payload,
-            output_dir_path=image_output_dir_path,
+            extract_image_block_to_payload=extract_image_block_to_payload,
+            output_dir_path=extract_image_block_output_dir,
        )

-    for el_type in extract_element_types:
+    for el_type in extract_image_block_types:
        if extract_images_in_pdf and el_type == ElementType.IMAGE:
            continue

@ -432,8 +433,8 @@ def _partition_pdf_or_image_local(
            file=file,
            is_image=is_image,
            pdf_image_dpi=pdf_image_dpi,
-            extract_to_payload=extract_to_payload,
-            output_dir_path=image_output_dir_path,
+            extract_image_block_to_payload=extract_image_block_to_payload,
+            output_dir_path=extract_image_block_output_dir,
        )

    out_elements = []
@ -444,7 +445,7 @@ def _partition_pdf_or_image_local(
        if isinstance(el, Image):
            if (
                not extract_images_in_pdf
-                and ElementType.IMAGE not in extract_element_types
+                and ElementType.IMAGE not in extract_image_block_types
                and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
            ):
                # NOTE(crag): small chunks of text from Image elements tend to be garbage
@ -478,9 +479,9 @@ def partition_pdf_or_image(
    metadata_last_modified: Optional[str] = None,
    hi_res_model_name: Optional[str] = None,
    extract_images_in_pdf: bool = False,
-    extract_element_types: Optional[List[str]] = None,
-    image_output_dir_path: Optional[str] = None,
-    extract_to_payload: bool = False,
+    extract_image_block_types: Optional[List[str]] = None,
+    extract_image_block_output_dir: Optional[str] = None,
+    extract_image_block_to_payload: bool = False,
    **kwargs,
 ) -> List[Element]:
    """Parses a pdf or image document into a list of interpreted elements."""
@ -523,7 +524,7 @@ def partition_pdf_or_image(
        pdf_text_extractable=pdf_text_extractable,
        infer_table_structure=infer_table_structure,
        extract_images_in_pdf=extract_images_in_pdf,
-        extract_element_types=extract_element_types,
+        extract_image_block_types=extract_image_block_types,
    )

    if file is not None:
@ -544,9 +545,9 @@ def partition_pdf_or_image(
                hi_res_model_name=hi_res_model_name,
                pdf_text_extractable=pdf_text_extractable,
                extract_images_in_pdf=extract_images_in_pdf,
-                extract_element_types=extract_element_types,
-                image_output_dir_path=image_output_dir_path,
-                extract_to_payload=extract_to_payload,
+                extract_image_block_types=extract_image_block_types,
+                extract_image_block_output_dir=extract_image_block_output_dir,
+                extract_image_block_to_payload=extract_image_block_to_payload,
                **kwargs,
            )
            out_elements = _process_uncategorized_text_elements(elements)
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@ -82,7 +82,7 @@ def save_elements(
    filename: str = "",
    file: Optional[Union[bytes, BinaryIO]] = None,
    is_image: bool = False,
-    extract_to_payload: bool = False,
+    extract_image_block_to_payload: bool = False,
    output_dir_path: Optional[str] = None,
 ):
    """
@ -143,7 +143,7 @@ def save_elements(
                image_path = image_paths[page_number - 1]
                image = Image.open(image_path)
                cropped_image = image.crop((x1, y1, x2, y2))
-                if extract_to_payload:
+                if extract_image_block_to_payload:
                    buffered = BytesIO()
                    cropped_image.save(buffered, format="JPEG")
                    img_base64 = base64.b64encode(buffered.getvalue())
@ -159,28 +159,28 @@ def save_elements(


 def check_element_types_to_extract(
-    extract_element_types: Optional[List[str]],
+    extract_image_block_types: Optional[List[str]],
 ) -> List[str]:
    """Check and normalize the provided list of element types to extract."""

-    if extract_element_types is None:
+    if extract_image_block_types is None:
        return []

-    if not isinstance(extract_element_types, list):
+    if not isinstance(extract_image_block_types, list):
        raise TypeError(
-            "The extract_element_types parameter must be a list of element types as strings, "
+            "The extract_image_block_types parameter must be a list of element types as strings, "
            "ex. ['Table', 'Image']",
        )

    available_element_types = list(ElementType.to_dict().values())
-    normalized_extract_element_types = []
-    for el_type in extract_element_types:
+    normalized_extract_image_block_types = []
+    for el_type in extract_image_block_types:
        normalized_el_type = el_type.lower().capitalize()
        if normalized_el_type not in available_element_types:
            logger.warning(f"The requested type ({el_type}) doesn't match any available type")
-        normalized_extract_element_types.append(normalized_el_type)
+        normalized_extract_image_block_types.append(normalized_el_type)

-    return normalized_extract_element_types
+    return normalized_extract_image_block_types


 def valid_text(text: str) -> bool:
--- a/unstructured/partition/strategies.py
+++ b/unstructured/partition/strategies.py
@ -27,7 +27,7 @@ def determine_pdf_or_image_strategy(
    pdf_text_extractable: bool = False,
    infer_table_structure: bool = False,
    extract_images_in_pdf: bool = False,
-    extract_element_types: Optional[List[str]] = None,
+    extract_image_block_types: Optional[List[str]] = None,
 ):
    """Determines what strategy to use for processing PDFs or images, accounting for fallback
    logic if some dependencies are not available."""
@ -35,7 +35,7 @@ def determine_pdf_or_image_strategy(
    unstructured_inference_installed = dependency_exists("unstructured_inference")

    if strategy == PartitionStrategy.AUTO:
-        extract_element = extract_images_in_pdf or bool(extract_element_types)
+        extract_element = extract_images_in_pdf or bool(extract_image_block_types)
        if is_image:
            strategy = _determine_image_auto_strategy()
        else: