fix: pass extract image args to all partitioners (#3950)

This is needed so that users can specify whether to extract the base64
data for images, which are now parsed by the HTML partitioner.

## Testing

Adds a test that validates this by calling the auto-partitioner, with the
appropriate arguments, on an HTML file containing a base64-embedded
image.
This commit is contained in:
ryannikolaidis 2025-03-09 21:15:08 -07:00 committed by GitHub
parent c0457c1cc3
commit 0001a33dba
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 28 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.16.26-dev1
## 0.16.26-dev2
### Enhancements

File diff suppressed because one or more lines are too long

View File

@ -627,6 +627,19 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b
)
def test_auto_partition_html_element_extraction():
    """Auto-partition an HTML example document and check image-block extraction.

    Calls ``partition`` on the ``html-with-base64-image.html`` example document
    with ``extract_image_block_types=["Image"]`` and
    ``extract_image_block_to_payload=True``, then hands the resulting elements
    to ``assert_element_extraction`` together with a temporary directory.
    """
    block_types = ["Image"]
    html_path = example_doc_path("html-with-base64-image.html")

    with tempfile.TemporaryDirectory() as tmpdir:
        elements = partition(
            html_path,
            extract_image_block_types=block_types,
            extract_image_block_to_payload=True,
        )
        # The literal True mirrors the extract_image_block_to_payload value used above.
        assert_element_extraction(elements, block_types, True, tmpdir)
def test_partition_pdf_does_not_raise_warning():
# NOTE(robinson): This is the recommended way to check that no warning is emitted,
# per the pytest docs.

View File

@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--input-path example-docs \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-num-files-output.sh 13 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -1 +1 @@
__version__ = "0.16.26-dev1" # pragma: no cover
__version__ = "0.16.26-dev2" # pragma: no cover

View File

@ -283,6 +283,8 @@ def partition(
partitioning_kwargs["languages"] = languages
partitioning_kwargs["starting_page_number"] = starting_page_number
partitioning_kwargs["strategy"] = strategy
partitioning_kwargs["extract_image_block_types"] = extract_image_block_types
partitioning_kwargs["extract_image_block_to_payload"] = extract_image_block_to_payload
partition = partitioner_loader.get(file_type)
elements = partition(filename=filename, file=file, **partitioning_kwargs)

View File

@ -37,6 +37,8 @@ def partition_html(
detection_origin: Optional[str] = None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
extract_image_block_to_payload: bool = False,
extract_image_block_types: Optional[list[str]] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
@ -86,6 +88,8 @@ def partition_html(
detection_origin=detection_origin,
html_parser_version=html_parser_version,
image_alt_mode=image_alt_mode,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
)
return list(_HtmlPartitioner.iter_elements(opts))