Refactor: rename image extraction kwargs (#2303)

Currently, partition() and partition_pdf() use different kwarg names for
image extraction. This affects the API, since API requests go through
partition().

### Summary
- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` -> `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in
`partition.auto`
- add unit tests to test element extraction for `pdf/image` via
`partition.auto`
### Testing
CI should pass.
This commit is contained in:
Christine Straub 2024-01-04 09:52:00 -08:00 committed by GitHub
parent 8e2bfcab18
commit 5b0ae3fd8b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 185 additions and 120 deletions

View File

@ -1,3 +1,13 @@
## 0.11.9-dev0
### Enhancements
* **Rename kwargs related to extracting image blocks.** Rename the kwargs related to extracting image blocks for consistency and API usage.
### Features
### Fixes
## 0.11.8
### Enhancements

View File

@ -637,29 +637,31 @@ def test_partition_image_has_filename(inference_results):
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_partition_image_element_extraction(
file_mode,
extract_to_payload,
extract_image_block_to_payload,
filename=example_doc_path("embedded-images-tables.jpg"),
):
extract_element_types = ["Image", "Table"]
extract_image_block_types = ["Image", "Table"]
with tempfile.TemporaryDirectory() as tmpdir:
if file_mode == "filename":
elements = image.partition_image(
filename=filename,
extract_element_types=extract_element_types,
extract_to_payload=extract_to_payload,
image_output_dir_path=tmpdir,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
)
else:
with open(filename, "rb") as f:
elements = image.partition_image(
file=f,
extract_element_types=extract_element_types,
extract_to_payload=extract_to_payload,
image_output_dir_path=tmpdir,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
)
assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
assert_element_extraction(
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
)

View File

@ -1128,9 +1128,11 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
assert expected_log in caplog.text
def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
def assert_element_extraction(
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
):
extracted_elements = []
for el_type in extract_element_types:
for el_type in extract_image_block_types:
extracted_elements_by_type = []
for el in elements:
if el.category == el_type:
@ -1139,7 +1141,7 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa
for extracted_elements_by_type in extracted_elements:
for i, el in enumerate(extracted_elements_by_type):
if extract_to_payload:
if extract_image_block_to_payload:
assert el.metadata.image_base64 is not None
assert el.metadata.image_mime_type == "image/jpeg"
image_data = base64.b64decode(el.metadata.image_base64)
@ -1157,29 +1159,31 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_partition_pdf_element_extraction(
file_mode,
extract_to_payload,
extract_image_block_to_payload,
filename=example_doc_path("embedded-images-tables.pdf"),
):
extract_element_types = ["Image", "Table"]
extract_image_block_types = ["Image", "Table"]
with tempfile.TemporaryDirectory() as tmpdir:
if file_mode == "filename":
elements = pdf.partition_pdf(
filename=filename,
extract_element_types=extract_element_types,
extract_to_payload=extract_to_payload,
image_output_dir_path=tmpdir,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
)
else:
with open(filename, "rb") as f:
elements = pdf.partition_pdf(
file=f,
extract_element_types=extract_element_types,
extract_to_payload=extract_to_payload,
image_output_dir_path=tmpdir,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
)
assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
assert_element_extraction(
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
)

View File

@ -61,10 +61,10 @@ def test_convert_pdf_to_image(
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_to_payload", [False, True])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_save_elements(
element_category_to_save,
extract_to_payload,
extract_image_block_to_payload,
filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
with tempfile.TemporaryDirectory() as tmpdir:
@ -101,7 +101,7 @@ def test_save_elements(
pdf_image_dpi=200,
filename=filename,
output_dir_path=str(tmpdir),
extract_to_payload=extract_to_payload,
extract_image_block_to_payload=extract_image_block_to_payload,
)
saved_elements = [el for el in elements if el.category == element_category_to_save]
@ -110,7 +110,7 @@ def test_save_elements(
expected_image_path = os.path.join(
str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
)
if extract_to_payload:
if extract_image_block_to_payload:
assert isinstance(el.metadata.image_base64, str)
assert isinstance(el.metadata.image_mime_type, str)
assert not el.metadata.image_path

View File

@ -1,6 +1,7 @@
import json
import os
import pathlib
import tempfile
import warnings
from importlib import import_module
from unittest.mock import Mock, patch
@ -8,6 +9,7 @@ from unittest.mock import Mock, patch
import docx
import pytest
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.partition.test_constants import (
EXPECTED_TABLE,
EXPECTED_TABLE_XLSX,
@ -355,9 +357,9 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=False,
extract_element_types=None,
image_output_dir_path=None,
extract_to_payload=False,
extract_image_block_types=None,
extract_image_block_output_dir=None,
extract_image_block_to_payload=False,
hi_res_model_name=None,
)
@ -460,6 +462,26 @@ def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, co
assert elements[idx].metadata.coordinates is not None
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_image_element_extraction(
extract_image_block_to_payload,
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"),
):
extract_image_block_types = ["Image", "Table"]
with tempfile.TemporaryDirectory() as tmpdir:
elements = partition(
filename=filename,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
)
assert_element_extraction(
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
)
@pytest.mark.parametrize(
("pass_metadata_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
@ -666,6 +688,26 @@ def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypat
assert all(el.metadata.filetype == expected for el in elements)
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_auto_partition_pdf_element_extraction(
extract_image_block_to_payload,
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"),
):
extract_image_block_types = ["Image", "Table"]
with tempfile.TemporaryDirectory() as tmpdir:
elements = partition(
filename=filename,
extract_image_block_types=extract_image_block_types,
extract_image_block_to_payload=extract_image_block_to_payload,
extract_image_block_output_dir=tmpdir,
)
assert_element_extraction(
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
)
supported_filetypes = [
_
for _ in FileType

View File

@ -76,7 +76,7 @@ def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_
"pdf_text_extractable",
"infer_table_structure",
"extract_images_in_pdf",
"extract_element_types",
"extract_image_block_types",
"expected",
),
[
@ -102,7 +102,7 @@ def test_determine_pdf_auto_strategy(
pdf_text_extractable,
infer_table_structure,
extract_images_in_pdf,
extract_element_types,
extract_image_block_types,
expected,
):
strategy = strategies.determine_pdf_or_image_strategy(
@ -111,7 +111,7 @@ def test_determine_pdf_auto_strategy(
pdf_text_extractable=pdf_text_extractable,
infer_table_structure=infer_table_structure,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
extract_image_block_types=extract_image_block_types,
)
assert strategy == expected

View File

@ -1 +1 @@
__version__ = "0.11.8" # pragma: no cover
__version__ = "0.11.9-dev0" # pragma: no cover

View File

@ -136,10 +136,10 @@ def partition(
languages: Optional[List[str]] = None,
detect_language_per_element: bool = False,
pdf_infer_table_structure: bool = False,
pdf_extract_images: bool = False,
pdf_extract_element_types: Optional[List[str]] = None,
pdf_image_output_dir_path: Optional[str] = None,
pdf_extract_to_payload: bool = False,
extract_images_in_pdf: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None,
@ -194,27 +194,28 @@ def partition(
additional metadata field, "text_as_html," where the value (string) is just a
transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False.
pdf_extract_images
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
If True, any detected images will be saved in the path specified by
'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
pdf_extract_element_types
'extract_image_block_types' for broader extraction capabilities.
extract_image_block_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
pdf_extract_to_payload
saved in the path specified by 'extract_image_block_output_dir' or stored as base64
encoded data within metadata fields.
extract_image_block_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
If True, images of the element type(s) defined in 'extract_image_block_types' will be
encoded as base64 data and stored in two metadata fields: 'image_base64' and
'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
pdf_image_output_dir_path
Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
extract_image_block_output_dir
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
specified in 'extract_image_block_types'.
xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml.
@ -413,11 +414,11 @@ def partition(
infer_table_structure=infer_table_structure,
strategy=strategy,
languages=languages,
extract_images_in_pdf=pdf_extract_images,
extract_element_types=pdf_extract_element_types,
image_output_dir_path=pdf_image_output_dir_path,
extract_to_payload=pdf_extract_to_payload,
hi_res_model_name=hi_res_model_name or model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)
elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
@ -430,6 +431,10 @@ def partition(
strategy=strategy,
languages=languages,
hi_res_model_name=hi_res_model_name or model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)
elif filetype == FileType.TXT:

View File

@ -27,9 +27,9 @@ def partition_image(
chunking_strategy: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
extract_to_payload: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses an image into a list of interpreted elements.
@ -64,25 +64,26 @@ def partition_image(
The layout detection model used when partitioning strategy is set to `hi_res`.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
If True, any detected images will be saved in the path specified by
'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
extract_element_types
'extract_image_block_types' for broader extraction capabilities.
extract_image_block_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
extract_to_payload
saved in the path specified by 'extract_image_block_output_dir' or stored as base64 encoded
data within metadata fields.
extract_image_block_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
If True, images of the element type(s) defined in 'extract_image_block_types' will be
encoded as base64 data and stored in two metadata fields: 'image_base64' and
'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
image_output_dir_path
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
extract_image_block_output_dir
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
specified in 'extract_image_block_types'.
"""
exactly_one(filename=filename, file=file)
@ -119,8 +120,8 @@ def partition_image(
metadata_last_modified=metadata_last_modified,
hi_res_model_name=hi_res_model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
extract_to_payload=extract_to_payload,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)

View File

@ -141,9 +141,9 @@ def partition_pdf(
links: Sequence[Link] = [],
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
extract_to_payload: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements.
@ -177,25 +177,26 @@ def partition_pdf(
The layout detection model used when partitioning strategy is set to `hi_res`.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
If True, any detected images will be saved in the path specified by
'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
extract_element_types
'extract_image_block_types' for broader extraction capabilities.
extract_image_block_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
extract_to_payload
saved in the path specified by 'extract_image_block_output_dir' or stored as base64
encoded data within metadata fields.
extract_image_block_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
If True, images of the element type(s) defined in 'extract_image_block_types' will be
encoded as base64 data and stored in two metadata fields: 'image_base64' and
'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
image_output_dir_path
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
extract_image_block_output_dir
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
specified in 'extract_image_block_types'.
"""
exactly_one(filename=filename, file=file)
@ -212,9 +213,9 @@ def partition_pdf(
metadata_last_modified=metadata_last_modified,
hi_res_model_name=hi_res_model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
extract_to_payload=extract_to_payload,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)
@ -266,9 +267,9 @@ def _partition_pdf_or_image_local(
metadata_last_modified: Optional[str] = None,
pdf_text_extractable: bool = False,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
extract_to_payload: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
analysis: bool = False,
analyzed_image_output_dir_path: Optional[str] = None,
**kwargs,
@ -406,7 +407,7 @@ def _partition_pdf_or_image_local(
**kwargs,
)
extract_element_types = check_element_types_to_extract(extract_element_types)
extract_image_block_types = check_element_types_to_extract(extract_image_block_types)
# NOTE(christine): `extract_images_in_pdf` will be deprecated
# (but will continue to be supported for a while)
if extract_images_in_pdf:
@ -417,11 +418,11 @@ def _partition_pdf_or_image_local(
file=file,
is_image=is_image,
pdf_image_dpi=pdf_image_dpi,
extract_to_payload=extract_to_payload,
output_dir_path=image_output_dir_path,
extract_image_block_to_payload=extract_image_block_to_payload,
output_dir_path=extract_image_block_output_dir,
)
for el_type in extract_element_types:
for el_type in extract_image_block_types:
if extract_images_in_pdf and el_type == ElementType.IMAGE:
continue
@ -432,8 +433,8 @@ def _partition_pdf_or_image_local(
file=file,
is_image=is_image,
pdf_image_dpi=pdf_image_dpi,
extract_to_payload=extract_to_payload,
output_dir_path=image_output_dir_path,
extract_image_block_to_payload=extract_image_block_to_payload,
output_dir_path=extract_image_block_output_dir,
)
out_elements = []
@ -444,7 +445,7 @@ def _partition_pdf_or_image_local(
if isinstance(el, Image):
if (
not extract_images_in_pdf
and ElementType.IMAGE not in extract_element_types
and ElementType.IMAGE not in extract_image_block_types
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
):
# NOTE(crag): small chunks of text from Image elements tend to be garbage
@ -478,9 +479,9 @@ def partition_pdf_or_image(
metadata_last_modified: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
extract_to_payload: bool = False,
extract_image_block_types: Optional[List[str]] = None,
extract_image_block_output_dir: Optional[str] = None,
extract_image_block_to_payload: bool = False,
**kwargs,
) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements."""
@ -523,7 +524,7 @@ def partition_pdf_or_image(
pdf_text_extractable=pdf_text_extractable,
infer_table_structure=infer_table_structure,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
extract_image_block_types=extract_image_block_types,
)
if file is not None:
@ -544,9 +545,9 @@ def partition_pdf_or_image(
hi_res_model_name=hi_res_model_name,
pdf_text_extractable=pdf_text_extractable,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
extract_to_payload=extract_to_payload,
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
**kwargs,
)
out_elements = _process_uncategorized_text_elements(elements)

View File

@ -82,7 +82,7 @@ def save_elements(
filename: str = "",
file: Optional[Union[bytes, BinaryIO]] = None,
is_image: bool = False,
extract_to_payload: bool = False,
extract_image_block_to_payload: bool = False,
output_dir_path: Optional[str] = None,
):
"""
@ -143,7 +143,7 @@ def save_elements(
image_path = image_paths[page_number - 1]
image = Image.open(image_path)
cropped_image = image.crop((x1, y1, x2, y2))
if extract_to_payload:
if extract_image_block_to_payload:
buffered = BytesIO()
cropped_image.save(buffered, format="JPEG")
img_base64 = base64.b64encode(buffered.getvalue())
@ -159,28 +159,28 @@ def save_elements(
def check_element_types_to_extract(
extract_element_types: Optional[List[str]],
extract_image_block_types: Optional[List[str]],
) -> List[str]:
"""Check and normalize the provided list of element types to extract."""
if extract_element_types is None:
if extract_image_block_types is None:
return []
if not isinstance(extract_element_types, list):
if not isinstance(extract_image_block_types, list):
raise TypeError(
"The extract_element_types parameter must be a list of element types as strings, "
"The extract_image_block_types parameter must be a list of element types as strings, "
"ex. ['Table', 'Image']",
)
available_element_types = list(ElementType.to_dict().values())
normalized_extract_element_types = []
for el_type in extract_element_types:
normalized_extract_image_block_types = []
for el_type in extract_image_block_types:
normalized_el_type = el_type.lower().capitalize()
if normalized_el_type not in available_element_types:
logger.warning(f"The requested type ({el_type}) doesn't match any available type")
normalized_extract_element_types.append(normalized_el_type)
normalized_extract_image_block_types.append(normalized_el_type)
return normalized_extract_element_types
return normalized_extract_image_block_types
def valid_text(text: str) -> bool:

View File

@ -27,7 +27,7 @@ def determine_pdf_or_image_strategy(
pdf_text_extractable: bool = False,
infer_table_structure: bool = False,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
extract_image_block_types: Optional[List[str]] = None,
):
"""Determines what strategy to use for processing PDFs or images, accounting for fallback
logic if some dependencies are not available."""
@ -35,7 +35,7 @@ def determine_pdf_or_image_strategy(
unstructured_inference_installed = dependency_exists("unstructured_inference")
if strategy == PartitionStrategy.AUTO:
extract_element = extract_images_in_pdf or bool(extract_element_types)
extract_element = extract_images_in_pdf or bool(extract_image_block_types)
if is_image:
strategy = _determine_image_auto_strategy()
else: