mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-06 12:21:30 +00:00
fix: pass extract image args to all partitioners (#3950)
This is needed so that the user can specify whether to extract the base64 payload for images, which are now parsed by the HTML partitioner.

## Testing

Adds a test that validates this by calling the auto-partitioner, with the appropriate image-extraction arguments, on an HTML file containing a base64-embedded image.
This commit is contained in:
parent
c0457c1cc3
commit
0001a33dba
@ -1,4 +1,4 @@
|
||||
## 0.16.26-dev1
|
||||
## 0.16.26-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
6
example-docs/html-with-base64-image.html
Normal file
6
example-docs/html-with-base64-image.html
Normal file
File diff suppressed because one or more lines are too long
@ -627,6 +627,19 @@ def test_auto_partition_pdf_element_extraction(extract_image_block_to_payload: b
|
||||
)
|
||||
|
||||
|
||||
def test_auto_partition_html_element_extraction():
|
||||
extract_image_block_types = ["Image"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
elements = partition(
|
||||
example_doc_path("html-with-base64-image.html"),
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=True,
|
||||
)
|
||||
|
||||
assert_element_extraction(elements, extract_image_block_types, True, tmpdir)
|
||||
|
||||
|
||||
def test_partition_pdf_does_not_raise_warning():
|
||||
# NOTE(robinson): This is the recommended way to check that no warning is emitted,
|
||||
# per the pytest docs.
|
||||
|
||||
@ -32,6 +32,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
--input-path example-docs \
|
||||
--work-dir "$WORK_DIR"
|
||||
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 13 $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 14 $OUTPUT_FOLDER_NAME
|
||||
|
||||
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.16.26-dev1" # pragma: no cover
|
||||
__version__ = "0.16.26-dev2" # pragma: no cover
|
||||
|
||||
@ -283,6 +283,8 @@ def partition(
|
||||
partitioning_kwargs["languages"] = languages
|
||||
partitioning_kwargs["starting_page_number"] = starting_page_number
|
||||
partitioning_kwargs["strategy"] = strategy
|
||||
partitioning_kwargs["extract_image_block_types"] = extract_image_block_types
|
||||
partitioning_kwargs["extract_image_block_to_payload"] = extract_image_block_to_payload
|
||||
|
||||
partition = partitioner_loader.get(file_type)
|
||||
elements = partition(filename=filename, file=file, **partitioning_kwargs)
|
||||
|
||||
@ -37,6 +37,8 @@ def partition_html(
|
||||
detection_origin: Optional[str] = None,
|
||||
html_parser_version: Literal["v1", "v2"] = "v1",
|
||||
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
|
||||
extract_image_block_to_payload: bool = False,
|
||||
extract_image_block_types: Optional[list[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[Element]:
|
||||
"""Partitions an HTML document into its constituent elements.
|
||||
@ -86,6 +88,8 @@ def partition_html(
|
||||
detection_origin=detection_origin,
|
||||
html_parser_version=html_parser_version,
|
||||
image_alt_mode=image_alt_mode,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
)
|
||||
|
||||
return list(_HtmlPartitioner.iter_elements(opts))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user