From 5b0ae3fd8b638f573d3a966ebeb0e3eef5d4bad2 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 4 Jan 2024 09:52:00 -0800 Subject: [PATCH] Refactor: rename image extraction kwargs (#2303) Currently, partition() and partition_pdf() use different kwarg names for image extraction, which has implications for the API since API requests go through partition(). ### Summary - rename `extract_element_types` -> `extract_image_block_types` - rename `image_output_dir_path` -> `extract_image_block_output_dir` - rename `extract_to_payload` -> `extract_image_block_to_payload` - rename `pdf_extract_images` -> `extract_images_in_pdf` in `partition.auto` - add unit tests to test element extraction for `pdf/image` via `partition.auto` ### Testing CI should pass. --- CHANGELOG.md | 10 +++ .../partition/pdf_image/test_image.py | 22 +++--- .../partition/pdf_image/test_pdf.py | 30 ++++---- .../pdf_image/test_pdf_image_utils.py | 8 +-- test_unstructured/partition/test_auto.py | 48 ++++++++++++- .../partition/test_strategies.py | 6 +- unstructured/__version__.py | 2 +- unstructured/partition/auto.py | 47 ++++++------ unstructured/partition/image.py | 37 +++++----- unstructured/partition/pdf.py | 71 ++++++++++--------- .../partition/pdf_image/pdf_image_utils.py | 20 +++--- unstructured/partition/strategies.py | 4 +- 12 files changed, 185 insertions(+), 120 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c670045fd..32113b493 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.11.9-dev0 + +### Enhancements + +* **Rename kwargs related to extracting image blocks.** Rename the kwargs related to extracting image blocks for consistency and API usage. 
+ +### Features + +### Fixes + ## 0.11.8 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index f3071f52a..d0dface90 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -637,29 +637,31 @@ def test_partition_image_has_filename(inference_results): @pytest.mark.parametrize("file_mode", ["filename", "rb"]) -@pytest.mark.parametrize("extract_to_payload", [False, True]) +@pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) def test_partition_image_element_extraction( file_mode, - extract_to_payload, + extract_image_block_to_payload, filename=example_doc_path("embedded-images-tables.jpg"), ): - extract_element_types = ["Image", "Table"] + extract_image_block_types = ["Image", "Table"] with tempfile.TemporaryDirectory() as tmpdir: if file_mode == "filename": elements = image.partition_image( filename=filename, - extract_element_types=extract_element_types, - extract_to_payload=extract_to_payload, - image_output_dir_path=tmpdir, + extract_image_block_types=extract_image_block_types, + extract_image_block_to_payload=extract_image_block_to_payload, + extract_image_block_output_dir=tmpdir, ) else: with open(filename, "rb") as f: elements = image.partition_image( file=f, - extract_element_types=extract_element_types, - extract_to_payload=extract_to_payload, - image_output_dir_path=tmpdir, + extract_image_block_types=extract_image_block_types, + extract_image_block_to_payload=extract_image_block_to_payload, + extract_image_block_output_dir=tmpdir, ) - assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir) + assert_element_extraction( + elements, extract_image_block_types, extract_image_block_to_payload, tmpdir + ) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 6fa8194a6..2e8d606c9 100644 --- 
a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1128,9 +1128,11 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo assert expected_log in caplog.text -def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir): +def assert_element_extraction( + elements, extract_image_block_types, extract_image_block_to_payload, tmpdir +): extracted_elements = [] - for el_type in extract_element_types: + for el_type in extract_image_block_types: extracted_elements_by_type = [] for el in elements: if el.category == el_type: @@ -1139,7 +1141,7 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa for extracted_elements_by_type in extracted_elements: for i, el in enumerate(extracted_elements_by_type): - if extract_to_payload: + if extract_image_block_to_payload: assert el.metadata.image_base64 is not None assert el.metadata.image_mime_type == "image/jpeg" image_data = base64.b64decode(el.metadata.image_base64) @@ -1157,29 +1159,31 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa @pytest.mark.parametrize("file_mode", ["filename", "rb"]) -@pytest.mark.parametrize("extract_to_payload", [False, True]) +@pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) def test_partition_pdf_element_extraction( file_mode, - extract_to_payload, + extract_image_block_to_payload, filename=example_doc_path("embedded-images-tables.pdf"), ): - extract_element_types = ["Image", "Table"] + extract_image_block_types = ["Image", "Table"] with tempfile.TemporaryDirectory() as tmpdir: if file_mode == "filename": elements = pdf.partition_pdf( filename=filename, - extract_element_types=extract_element_types, - extract_to_payload=extract_to_payload, - image_output_dir_path=tmpdir, + extract_image_block_types=extract_image_block_types, + extract_image_block_to_payload=extract_image_block_to_payload, + 
extract_image_block_output_dir=tmpdir, ) else: with open(filename, "rb") as f: elements = pdf.partition_pdf( file=f, - extract_element_types=extract_element_types, - extract_to_payload=extract_to_payload, - image_output_dir_path=tmpdir, + extract_image_block_types=extract_image_block_types, + extract_image_block_to_payload=extract_image_block_to_payload, + extract_image_block_output_dir=tmpdir, ) - assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir) + assert_element_extraction( + elements, extract_image_block_types, extract_image_block_to_payload, tmpdir + ) diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index 13bc2b50b..7cf823e09 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -61,10 +61,10 @@ def test_convert_pdf_to_image( @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) -@pytest.mark.parametrize("extract_to_payload", [False, True]) +@pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) def test_save_elements( element_category_to_save, - extract_to_payload, + extract_image_block_to_payload, filename=example_doc_path("layout-parser-paper-fast.pdf"), ): with tempfile.TemporaryDirectory() as tmpdir: @@ -101,7 +101,7 @@ def test_save_elements( pdf_image_dpi=200, filename=filename, output_dir_path=str(tmpdir), - extract_to_payload=extract_to_payload, + extract_image_block_to_payload=extract_image_block_to_payload, ) saved_elements = [el for el in elements if el.category == element_category_to_save] @@ -110,7 +110,7 @@ def test_save_elements( expected_image_path = os.path.join( str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg" ) - if extract_to_payload: + if extract_image_block_to_payload: assert isinstance(el.metadata.image_base64, str) assert 
isinstance(el.metadata.image_mime_type, str) assert not el.metadata.image_path diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 4df28143d..bd2b3596f 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1,6 +1,7 @@ import json import os import pathlib +import tempfile import warnings from importlib import import_module from unittest.mock import Mock, patch @@ -8,6 +9,7 @@ from unittest.mock import Mock, patch import docx import pytest +from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction from test_unstructured.partition.test_constants import ( EXPECTED_TABLE, EXPECTED_TABLE_XLSX, @@ -355,9 +357,9 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch): include_page_breaks=False, infer_table_structure=False, extract_images_in_pdf=False, - extract_element_types=None, - image_output_dir_path=None, - extract_to_payload=False, + extract_image_block_types=None, + extract_image_block_output_dir=None, + extract_image_block_to_payload=False, hi_res_model_name=None, ) @@ -460,6 +462,26 @@ def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, co assert elements[idx].metadata.coordinates is not None +@pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) +def test_auto_partition_image_element_extraction( + extract_image_block_to_payload, + filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"), +): + extract_image_block_types = ["Image", "Table"] + + with tempfile.TemporaryDirectory() as tmpdir: + elements = partition( + filename=filename, + extract_image_block_types=extract_image_block_types, + extract_image_block_to_payload=extract_image_block_to_payload, + extract_image_block_output_dir=tmpdir, + ) + + assert_element_extraction( + elements, extract_image_block_types, extract_image_block_to_payload, tmpdir + ) + + @pytest.mark.parametrize( ("pass_metadata_filename", 
"content_type"), [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)], @@ -666,6 +688,26 @@ def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypat assert all(el.metadata.filetype == expected for el in elements) +@pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) +def test_auto_partition_pdf_element_extraction( + extract_image_block_to_payload, + filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"), +): + extract_image_block_types = ["Image", "Table"] + + with tempfile.TemporaryDirectory() as tmpdir: + elements = partition( + filename=filename, + extract_image_block_types=extract_image_block_types, + extract_image_block_to_payload=extract_image_block_to_payload, + extract_image_block_output_dir=tmpdir, + ) + + assert_element_extraction( + elements, extract_image_block_types, extract_image_block_to_payload, tmpdir + ) + + supported_filetypes = [ _ for _ in FileType diff --git a/test_unstructured/partition/test_strategies.py b/test_unstructured/partition/test_strategies.py index b51c0fbe9..fbc580bda 100644 --- a/test_unstructured/partition/test_strategies.py +++ b/test_unstructured/partition/test_strategies.py @@ -76,7 +76,7 @@ def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_ "pdf_text_extractable", "infer_table_structure", "extract_images_in_pdf", - "extract_element_types", + "extract_image_block_types", "expected", ), [ @@ -102,7 +102,7 @@ def test_determine_pdf_auto_strategy( pdf_text_extractable, infer_table_structure, extract_images_in_pdf, - extract_element_types, + extract_image_block_types, expected, ): strategy = strategies.determine_pdf_or_image_strategy( @@ -111,7 +111,7 @@ def test_determine_pdf_auto_strategy( pdf_text_extractable=pdf_text_extractable, infer_table_structure=infer_table_structure, extract_images_in_pdf=extract_images_in_pdf, - extract_element_types=extract_element_types, + 
extract_image_block_types=extract_image_block_types, ) assert strategy == expected diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 320de9b63..0d1529cc6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.8" # pragma: no cover +__version__ = "0.11.9-dev0" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 23d7d9ec3..df7e8bf8a 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -136,10 +136,10 @@ def partition( languages: Optional[List[str]] = None, detect_language_per_element: bool = False, pdf_infer_table_structure: bool = False, - pdf_extract_images: bool = False, - pdf_extract_element_types: Optional[List[str]] = None, - pdf_image_output_dir_path: Optional[str] = None, - pdf_extract_to_payload: bool = False, + extract_images_in_pdf: bool = False, + extract_image_block_types: Optional[List[str]] = None, + extract_image_block_output_dir: Optional[str] = None, + extract_image_block_to_payload: bool = False, xml_keep_tags: bool = False, data_source_metadata: Optional[DataSourceMetadata] = None, metadata_filename: Optional[str] = None, @@ -194,27 +194,28 @@ def partition( additional metadata field, "text_as_html," where the value (string) is a just a transformation of the data into an HTML <table>. The "text" field for a partitioned Table Element is always present, whether True or False. - pdf_extract_images + extract_images_in_pdf Only applicable if `strategy=hi_res`. - If True, any detected images will be saved in the path specified by 'image_output_dir_path' - or stored as base64 encoded data within metadata fields. + If True, any detected images will be saved in the path specified by + 'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields. Deprecation Note: This parameter is marked for deprecation. 
Future versions will use - 'extract_element_types' for broader extraction capabilities. - pdf_extract_element_types + 'extract_image_block_types' for broader extraction capabilities. + extract_image_block_types Only applicable if `strategy=hi_res`. Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be - saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data - within metadata fields. - pdf_extract_to_payload + saved in the path specified by 'extract_image_block_output_dir' or stored as base64 + encoded data within metadata fields. + extract_image_block_to_payload Only applicable if `strategy=hi_res`. - If True, images of the element type(s) defined in 'extract_element_types' will be encoded - as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'. + If True, images of the element type(s) defined in 'extract_image_block_types' will be + encoded as base64 data and stored in two metadata fields: 'image_base64' and + 'image_mime_type'. This parameter facilitates the inclusion of element data directly within the payload, especially for web-based applications or APIs. - pdf_image_output_dir_path - Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`. + extract_image_block_output_dir + Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`. The filesystem path for saving images of the element type(s) - specified in 'extract_element_types'. + specified in 'extract_image_block_types'. xml_keep_tags If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml. 
@@ -413,11 +414,11 @@ def partition( infer_table_structure=infer_table_structure, strategy=strategy, languages=languages, - extract_images_in_pdf=pdf_extract_images, - extract_element_types=pdf_extract_element_types, - image_output_dir_path=pdf_image_output_dir_path, - extract_to_payload=pdf_extract_to_payload, hi_res_model_name=hi_res_model_name or model_name, + extract_images_in_pdf=extract_images_in_pdf, + extract_image_block_types=extract_image_block_types, + extract_image_block_output_dir=extract_image_block_output_dir, + extract_image_block_to_payload=extract_image_block_to_payload, **kwargs, ) elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF): @@ -430,6 +431,10 @@ def partition( strategy=strategy, languages=languages, hi_res_model_name=hi_res_model_name or model_name, + extract_images_in_pdf=extract_images_in_pdf, + extract_image_block_types=extract_image_block_types, + extract_image_block_output_dir=extract_image_block_output_dir, + extract_image_block_to_payload=extract_image_block_to_payload, **kwargs, ) elif filetype == FileType.TXT: diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 9fb890d28..27e1fb03b 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -27,9 +27,9 @@ def partition_image( chunking_strategy: Optional[str] = None, hi_res_model_name: Optional[str] = None, extract_images_in_pdf: bool = False, - extract_element_types: Optional[List[str]] = None, - image_output_dir_path: Optional[str] = None, - extract_to_payload: bool = False, + extract_image_block_types: Optional[List[str]] = None, + extract_image_block_output_dir: Optional[str] = None, + extract_image_block_to_payload: bool = False, **kwargs, ) -> List[Element]: """Parses an image into a list of interpreted elements. @@ -64,25 +64,26 @@ def partition_image( The layout detection model used when partitioning strategy is set to `hi_res`. 
extract_images_in_pdf Only applicable if `strategy=hi_res`. - If True, any detected images will be saved in the path specified by 'image_output_dir_path' - or stored as base64 encoded data within metadata fields. + If True, any detected images will be saved in the path specified by + 'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields. Deprecation Note: This parameter is marked for deprecation. Future versions will use - 'extract_element_types' for broader extraction capabilities. - extract_element_types + 'extract_image_block_types' for broader extraction capabilities. + extract_image_block_types Only applicable if `strategy=hi_res`. Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be - saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data - within metadata fields. - extract_to_payload + saved in the path specified by 'extract_image_block_output_dir' or stored as base64 encoded + data within metadata fields. + extract_image_block_to_payload Only applicable if `strategy=hi_res`. - If True, images of the element type(s) defined in 'extract_element_types' will be encoded - as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'. + If True, images of the element type(s) defined in 'extract_image_block_types' will be + encoded as base64 data and stored in two metadata fields: 'image_base64' and + 'image_mime_type'. This parameter facilitates the inclusion of element data directly within the payload, especially for web-based applications or APIs. - image_output_dir_path - Only applicable if `strategy=hi_res` and `extract_to_payload=False`. + extract_image_block_output_dir + Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`. The filesystem path for saving images of the element type(s) - specified in 'extract_element_types'. + specified in 'extract_image_block_types'. 
""" exactly_one(filename=filename, file=file) @@ -119,8 +120,8 @@ def partition_image( metadata_last_modified=metadata_last_modified, hi_res_model_name=hi_res_model_name, extract_images_in_pdf=extract_images_in_pdf, - extract_element_types=extract_element_types, - image_output_dir_path=image_output_dir_path, - extract_to_payload=extract_to_payload, + extract_image_block_types=extract_image_block_types, + extract_image_block_output_dir=extract_image_block_output_dir, + extract_image_block_to_payload=extract_image_block_to_payload, **kwargs, ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 9e61023da..09cae60cb 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -141,9 +141,9 @@ def partition_pdf( links: Sequence[Link] = [], hi_res_model_name: Optional[str] = None, extract_images_in_pdf: bool = False, - extract_element_types: Optional[List[str]] = None, - image_output_dir_path: Optional[str] = None, - extract_to_payload: bool = False, + extract_image_block_types: Optional[List[str]] = None, + extract_image_block_output_dir: Optional[str] = None, + extract_image_block_to_payload: bool = False, **kwargs, ) -> List[Element]: """Parses a pdf document into a list of interpreted elements. @@ -177,25 +177,26 @@ def partition_pdf( The layout detection model used when partitioning strategy is set to `hi_res`. extract_images_in_pdf Only applicable if `strategy=hi_res`. - If True, any detected images will be saved in the path specified by 'image_output_dir_path' - or stored as base64 encoded data within metadata fields. + If True, any detected images will be saved in the path specified by + 'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields. Deprecation Note: This parameter is marked for deprecation. Future versions will use - 'extract_element_types' for broader extraction capabilities. - extract_element_types + 'extract_image_block_types' for broader extraction capabilities. 
+ extract_image_block_types Only applicable if `strategy=hi_res`. Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be - saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data - within metadata fields. - extract_to_payload + saved in the path specified by 'extract_image_block_output_dir' or stored as base64 + encoded data within metadata fields. + extract_image_block_to_payload Only applicable if `strategy=hi_res`. - If True, images of the element type(s) defined in 'extract_element_types' will be encoded - as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'. + If True, images of the element type(s) defined in 'extract_image_block_types' will be + encoded as base64 data and stored in two metadata fields: 'image_base64' and + 'image_mime_type'. This parameter facilitates the inclusion of element data directly within the payload, especially for web-based applications or APIs. - image_output_dir_path - Only applicable if `strategy=hi_res` and `extract_to_payload=False`. + extract_image_block_output_dir + Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`. The filesystem path for saving images of the element type(s) - specified in 'extract_element_types'. + specified in 'extract_image_block_types'. 
""" exactly_one(filename=filename, file=file) @@ -212,9 +213,9 @@ def partition_pdf( metadata_last_modified=metadata_last_modified, hi_res_model_name=hi_res_model_name, extract_images_in_pdf=extract_images_in_pdf, - extract_element_types=extract_element_types, - image_output_dir_path=image_output_dir_path, - extract_to_payload=extract_to_payload, + extract_image_block_types=extract_image_block_types, + extract_image_block_output_dir=extract_image_block_output_dir, + extract_image_block_to_payload=extract_image_block_to_payload, **kwargs, ) @@ -266,9 +267,9 @@ def _partition_pdf_or_image_local( metadata_last_modified: Optional[str] = None, pdf_text_extractable: bool = False, extract_images_in_pdf: bool = False, - extract_element_types: Optional[List[str]] = None, - image_output_dir_path: Optional[str] = None, - extract_to_payload: bool = False, + extract_image_block_types: Optional[List[str]] = None, + extract_image_block_output_dir: Optional[str] = None, + extract_image_block_to_payload: bool = False, analysis: bool = False, analyzed_image_output_dir_path: Optional[str] = None, **kwargs, @@ -406,7 +407,7 @@ def _partition_pdf_or_image_local( **kwargs, ) - extract_element_types = check_element_types_to_extract(extract_element_types) + extract_image_block_types = check_element_types_to_extract(extract_image_block_types) # NOTE(christine): `extract_images_in_pdf` would deprecate # (but continue to support for a while) if extract_images_in_pdf: @@ -417,11 +418,11 @@ def _partition_pdf_or_image_local( file=file, is_image=is_image, pdf_image_dpi=pdf_image_dpi, - extract_to_payload=extract_to_payload, - output_dir_path=image_output_dir_path, + extract_image_block_to_payload=extract_image_block_to_payload, + output_dir_path=extract_image_block_output_dir, ) - for el_type in extract_element_types: + for el_type in extract_image_block_types: if extract_images_in_pdf and el_type == ElementType.IMAGE: continue @@ -432,8 +433,8 @@ def _partition_pdf_or_image_local( file=file, 
is_image=is_image, pdf_image_dpi=pdf_image_dpi, - extract_to_payload=extract_to_payload, - output_dir_path=image_output_dir_path, + extract_image_block_to_payload=extract_image_block_to_payload, + output_dir_path=extract_image_block_output_dir, ) out_elements = [] @@ -444,7 +445,7 @@ def _partition_pdf_or_image_local( if isinstance(el, Image): if ( not extract_images_in_pdf - and ElementType.IMAGE not in extract_element_types + and ElementType.IMAGE not in extract_image_block_types and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1) ): # NOTE(crag): small chunks of text from Image elements tend to be garbage @@ -478,9 +479,9 @@ def partition_pdf_or_image( metadata_last_modified: Optional[str] = None, hi_res_model_name: Optional[str] = None, extract_images_in_pdf: bool = False, - extract_element_types: Optional[List[str]] = None, - image_output_dir_path: Optional[str] = None, - extract_to_payload: bool = False, + extract_image_block_types: Optional[List[str]] = None, + extract_image_block_output_dir: Optional[str] = None, + extract_image_block_to_payload: bool = False, **kwargs, ) -> List[Element]: """Parses a pdf or image document into a list of interpreted elements.""" @@ -523,7 +524,7 @@ def partition_pdf_or_image( pdf_text_extractable=pdf_text_extractable, infer_table_structure=infer_table_structure, extract_images_in_pdf=extract_images_in_pdf, - extract_element_types=extract_element_types, + extract_image_block_types=extract_image_block_types, ) if file is not None: @@ -544,9 +545,9 @@ def partition_pdf_or_image( hi_res_model_name=hi_res_model_name, pdf_text_extractable=pdf_text_extractable, extract_images_in_pdf=extract_images_in_pdf, - extract_element_types=extract_element_types, - image_output_dir_path=image_output_dir_path, - extract_to_payload=extract_to_payload, + extract_image_block_types=extract_image_block_types, + extract_image_block_output_dir=extract_image_block_output_dir, + 
extract_image_block_to_payload=extract_image_block_to_payload, **kwargs, ) out_elements = _process_uncategorized_text_elements(elements) diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index 5ffd8d07f..d551a46e5 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -82,7 +82,7 @@ def save_elements( filename: str = "", file: Optional[Union[bytes, BinaryIO]] = None, is_image: bool = False, - extract_to_payload: bool = False, + extract_image_block_to_payload: bool = False, output_dir_path: Optional[str] = None, ): """ @@ -143,7 +143,7 @@ def save_elements( image_path = image_paths[page_number - 1] image = Image.open(image_path) cropped_image = image.crop((x1, y1, x2, y2)) - if extract_to_payload: + if extract_image_block_to_payload: buffered = BytesIO() cropped_image.save(buffered, format="JPEG") img_base64 = base64.b64encode(buffered.getvalue()) @@ -159,28 +159,28 @@ def save_elements( def check_element_types_to_extract( - extract_element_types: Optional[List[str]], + extract_image_block_types: Optional[List[str]], ) -> List[str]: """Check and normalize the provided list of element types to extract.""" - if extract_element_types is None: + if extract_image_block_types is None: return [] - if not isinstance(extract_element_types, list): + if not isinstance(extract_image_block_types, list): raise TypeError( - "The extract_element_types parameter must be a list of element types as strings, " + "The extract_image_block_types parameter must be a list of element types as strings, " "ex. 
['Table', 'Image']", ) available_element_types = list(ElementType.to_dict().values()) - normalized_extract_element_types = [] - for el_type in extract_element_types: + normalized_extract_image_block_types = [] + for el_type in extract_image_block_types: normalized_el_type = el_type.lower().capitalize() if normalized_el_type not in available_element_types: logger.warning(f"The requested type ({el_type}) doesn't match any available type") - normalized_extract_element_types.append(normalized_el_type) + normalized_extract_image_block_types.append(normalized_el_type) - return normalized_extract_element_types + return normalized_extract_image_block_types def valid_text(text: str) -> bool: diff --git a/unstructured/partition/strategies.py b/unstructured/partition/strategies.py index 98fe98582..2a3bc226c 100644 --- a/unstructured/partition/strategies.py +++ b/unstructured/partition/strategies.py @@ -27,7 +27,7 @@ def determine_pdf_or_image_strategy( pdf_text_extractable: bool = False, infer_table_structure: bool = False, extract_images_in_pdf: bool = False, - extract_element_types: Optional[List[str]] = None, + extract_image_block_types: Optional[List[str]] = None, ): """Determines what strategy to use for processing PDFs or images, accounting for fallback logic if some dependencies are not available.""" @@ -35,7 +35,7 @@ def determine_pdf_or_image_strategy( unstructured_inference_installed = dependency_exists("unstructured_inference") if strategy == PartitionStrategy.AUTO: - extract_element = extract_images_in_pdf or bool(extract_element_types) + extract_element = extract_images_in_pdf or bool(extract_image_block_types) if is_image: strategy = _determine_image_auto_strategy() else: