fix: pass extract image args to all partitioners (#3950)

This is needed so that the user can specify whether to extract the base64 payload for images, which are now parsed by the HTML partitioner.

## Testing

Adds a test that validates this by calling the auto-partitioner, with the appropriate image-extraction arguments, on an HTML file containing a base64-embedded image.
2026-01-06 12:21:30 +00:00 · 2025-03-09 21:15:08 -07:00 · 2025-03-09 21:15:08 -07:00 · 0001a33dba
commit 0001a33dba
parent c0457c1cc3
7 changed files with 28 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.16.26-dev1
+## 0.16.26-dev2

 ### Enhancements

--- a/example-docs/html-with-base64-image.html
+++ b/example-docs/html-with-base64-image.html
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -627,6 +627,19 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b
        )


+def test_auto_partition_html_element_extraction():
+    extract_image_block_types = ["Image"]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        elements = partition(
+            example_doc_path("html-with-base64-image.html"),
+            extract_image_block_types=extract_image_block_types,
+            extract_image_block_to_payload=True,
+        )
+
+        assert_element_extraction(elements, extract_image_block_types, True, tmpdir)
+
+
 def test_partition_pdf_does_not_raise_warning():
    # NOTE(robinson): This is the recommended way to check that no warning is emitted,
    # per the pytest docs.
--- a/test_unstructured_ingest/src/local.sh
+++ b/test_unstructured_ingest/src/local.sh
@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  --input-path example-docs \
  --work-dir "$WORK_DIR"

-"$SCRIPT_DIR"/check-num-files-output.sh 13 $OUTPUT_FOLDER_NAME
+"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME

 "$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.26-dev1"  # pragma: no cover
+__version__ = "0.16.26-dev2"  # pragma: no cover
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -283,6 +283,8 @@ def partition(
    partitioning_kwargs["languages"] = languages
    partitioning_kwargs["starting_page_number"] = starting_page_number
    partitioning_kwargs["strategy"] = strategy
+    partitioning_kwargs["extract_image_block_types"] = extract_image_block_types
+    partitioning_kwargs["extract_image_block_to_payload"] = extract_image_block_to_payload

    partition = partitioner_loader.get(file_type)
    elements = partition(filename=filename, file=file, **partitioning_kwargs)
--- a/unstructured/partition/html/partition.py
+++ b/unstructured/partition/html/partition.py
@ -37,6 +37,8 @@ def partition_html(
    detection_origin: Optional[str] = None,
    html_parser_version: Literal["v1", "v2"] = "v1",
    image_alt_mode: Optional[Literal["to_text"]] = "to_text",
+    extract_image_block_to_payload: bool = False,
+    extract_image_block_types: Optional[list[str]] = None,
    **kwargs: Any,
 ) -> list[Element]:
    """Partitions an HTML document into its constituent elements.
@ -86,6 +88,8 @@ def partition_html(
        detection_origin=detection_origin,
        html_parser_version=html_parser_version,
        image_alt_mode=image_alt_mode,
+        extract_image_block_types=extract_image_block_types,
+        extract_image_block_to_payload=extract_image_block_to_payload,
    )

    return list(_HtmlPartitioner.iter_elements(opts))