mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 13:44:05 +00:00
Refactor: rename image extraction kwargs (#2303)
Currently, we're using different kwarg names in partition() and partition_pdf(), which has implications for the API, since API calls go through partition().

### Summary

- rename `extract_element_types` -> `extract_image_block_types`
- rename `image_output_dir_path` -> `extract_image_block_output_dir`
- rename `extract_to_payload` -> `extract_image_block_to_payload`
- rename `pdf_extract_images` -> `extract_images_in_pdf` in `partition.auto`
- add unit tests to test element extraction for `pdf/image` via `partition.auto`

### Testing

CI should pass.
This commit is contained in:
parent
8e2bfcab18
commit
5b0ae3fd8b
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.11.9-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Rename kwargs related to extracting image blocks.** Rename the kwargs related to extracting image blocks for consistency and API usage.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.11.8
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -637,29 +637,31 @@ def test_partition_image_has_filename(inference_results):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
|
||||
@pytest.mark.parametrize("extract_to_payload", [False, True])
|
||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||
def test_partition_image_element_extraction(
|
||||
file_mode,
|
||||
extract_to_payload,
|
||||
extract_image_block_to_payload,
|
||||
filename=example_doc_path("embedded-images-tables.jpg"),
|
||||
):
|
||||
extract_element_types = ["Image", "Table"]
|
||||
extract_image_block_types = ["Image", "Table"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
if file_mode == "filename":
|
||||
elements = image.partition_image(
|
||||
filename=filename,
|
||||
extract_element_types=extract_element_types,
|
||||
extract_to_payload=extract_to_payload,
|
||||
image_output_dir_path=tmpdir,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
extract_image_block_output_dir=tmpdir,
|
||||
)
|
||||
else:
|
||||
with open(filename, "rb") as f:
|
||||
elements = image.partition_image(
|
||||
file=f,
|
||||
extract_element_types=extract_element_types,
|
||||
extract_to_payload=extract_to_payload,
|
||||
image_output_dir_path=tmpdir,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
extract_image_block_output_dir=tmpdir,
|
||||
)
|
||||
|
||||
assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
|
||||
assert_element_extraction(
|
||||
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
|
||||
)
|
||||
|
||||
@ -1128,9 +1128,11 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
|
||||
assert expected_log in caplog.text
|
||||
|
||||
|
||||
def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
|
||||
def assert_element_extraction(
|
||||
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
|
||||
):
|
||||
extracted_elements = []
|
||||
for el_type in extract_element_types:
|
||||
for el_type in extract_image_block_types:
|
||||
extracted_elements_by_type = []
|
||||
for el in elements:
|
||||
if el.category == el_type:
|
||||
@ -1139,7 +1141,7 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa
|
||||
|
||||
for extracted_elements_by_type in extracted_elements:
|
||||
for i, el in enumerate(extracted_elements_by_type):
|
||||
if extract_to_payload:
|
||||
if extract_image_block_to_payload:
|
||||
assert el.metadata.image_base64 is not None
|
||||
assert el.metadata.image_mime_type == "image/jpeg"
|
||||
image_data = base64.b64decode(el.metadata.image_base64)
|
||||
@ -1157,29 +1159,31 @@ def assert_element_extraction(elements, extract_element_types, extract_to_payloa
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
|
||||
@pytest.mark.parametrize("extract_to_payload", [False, True])
|
||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||
def test_partition_pdf_element_extraction(
|
||||
file_mode,
|
||||
extract_to_payload,
|
||||
extract_image_block_to_payload,
|
||||
filename=example_doc_path("embedded-images-tables.pdf"),
|
||||
):
|
||||
extract_element_types = ["Image", "Table"]
|
||||
extract_image_block_types = ["Image", "Table"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
if file_mode == "filename":
|
||||
elements = pdf.partition_pdf(
|
||||
filename=filename,
|
||||
extract_element_types=extract_element_types,
|
||||
extract_to_payload=extract_to_payload,
|
||||
image_output_dir_path=tmpdir,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
extract_image_block_output_dir=tmpdir,
|
||||
)
|
||||
else:
|
||||
with open(filename, "rb") as f:
|
||||
elements = pdf.partition_pdf(
|
||||
file=f,
|
||||
extract_element_types=extract_element_types,
|
||||
extract_to_payload=extract_to_payload,
|
||||
image_output_dir_path=tmpdir,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
extract_image_block_output_dir=tmpdir,
|
||||
)
|
||||
|
||||
assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
|
||||
assert_element_extraction(
|
||||
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
|
||||
)
|
||||
|
||||
@ -61,10 +61,10 @@ def test_convert_pdf_to_image(
|
||||
|
||||
|
||||
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
|
||||
@pytest.mark.parametrize("extract_to_payload", [False, True])
|
||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||
def test_save_elements(
|
||||
element_category_to_save,
|
||||
extract_to_payload,
|
||||
extract_image_block_to_payload,
|
||||
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@ -101,7 +101,7 @@ def test_save_elements(
|
||||
pdf_image_dpi=200,
|
||||
filename=filename,
|
||||
output_dir_path=str(tmpdir),
|
||||
extract_to_payload=extract_to_payload,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
)
|
||||
|
||||
saved_elements = [el for el in elements if el.category == element_category_to_save]
|
||||
@ -110,7 +110,7 @@ def test_save_elements(
|
||||
expected_image_path = os.path.join(
|
||||
str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
|
||||
)
|
||||
if extract_to_payload:
|
||||
if extract_image_block_to_payload:
|
||||
assert isinstance(el.metadata.image_base64, str)
|
||||
assert isinstance(el.metadata.image_mime_type, str)
|
||||
assert not el.metadata.image_path
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
import warnings
|
||||
from importlib import import_module
|
||||
from unittest.mock import Mock, patch
|
||||
@ -8,6 +9,7 @@ from unittest.mock import Mock, patch
|
||||
import docx
|
||||
import pytest
|
||||
|
||||
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
||||
from test_unstructured.partition.test_constants import (
|
||||
EXPECTED_TABLE,
|
||||
EXPECTED_TABLE_XLSX,
|
||||
@ -355,9 +357,9 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
||||
include_page_breaks=False,
|
||||
infer_table_structure=False,
|
||||
extract_images_in_pdf=False,
|
||||
extract_element_types=None,
|
||||
image_output_dir_path=None,
|
||||
extract_to_payload=False,
|
||||
extract_image_block_types=None,
|
||||
extract_image_block_output_dir=None,
|
||||
extract_image_block_to_payload=False,
|
||||
hi_res_model_name=None,
|
||||
)
|
||||
|
||||
@ -460,6 +462,26 @@ def test_auto_partition_image_default_strategy_hi_res(pass_metadata_filename, co
|
||||
assert elements[idx].metadata.coordinates is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||
def test_auto_partition_image_element_extraction(
|
||||
extract_image_block_to_payload,
|
||||
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.jpg"),
|
||||
):
|
||||
extract_image_block_types = ["Image", "Table"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
extract_image_block_output_dir=tmpdir,
|
||||
)
|
||||
|
||||
assert_element_extraction(
|
||||
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("pass_metadata_filename", "content_type"),
|
||||
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
|
||||
@ -666,6 +688,26 @@ def test_auto_filetype_overrides_file_specific(content_type, expected, monkeypat
|
||||
assert all(el.metadata.filetype == expected for el in elements)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
|
||||
def test_auto_partition_pdf_element_extraction(
|
||||
extract_image_block_to_payload,
|
||||
filename=os.path.join(EXAMPLE_DOCS_DIRECTORY, "embedded-images-tables.pdf"),
|
||||
):
|
||||
extract_image_block_types = ["Image", "Table"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
elements = partition(
|
||||
filename=filename,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
extract_image_block_output_dir=tmpdir,
|
||||
)
|
||||
|
||||
assert_element_extraction(
|
||||
elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
|
||||
)
|
||||
|
||||
|
||||
supported_filetypes = [
|
||||
_
|
||||
for _ in FileType
|
||||
|
||||
@ -76,7 +76,7 @@ def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_
|
||||
"pdf_text_extractable",
|
||||
"infer_table_structure",
|
||||
"extract_images_in_pdf",
|
||||
"extract_element_types",
|
||||
"extract_image_block_types",
|
||||
"expected",
|
||||
),
|
||||
[
|
||||
@ -102,7 +102,7 @@ def test_determine_pdf_auto_strategy(
|
||||
pdf_text_extractable,
|
||||
infer_table_structure,
|
||||
extract_images_in_pdf,
|
||||
extract_element_types,
|
||||
extract_image_block_types,
|
||||
expected,
|
||||
):
|
||||
strategy = strategies.determine_pdf_or_image_strategy(
|
||||
@ -111,7 +111,7 @@ def test_determine_pdf_auto_strategy(
|
||||
pdf_text_extractable=pdf_text_extractable,
|
||||
infer_table_structure=infer_table_structure,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
)
|
||||
assert strategy == expected
|
||||
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.11.8" # pragma: no cover
|
||||
__version__ = "0.11.9-dev0" # pragma: no cover
|
||||
|
||||
@ -136,10 +136,10 @@ def partition(
|
||||
languages: Optional[List[str]] = None,
|
||||
detect_language_per_element: bool = False,
|
||||
pdf_infer_table_structure: bool = False,
|
||||
pdf_extract_images: bool = False,
|
||||
pdf_extract_element_types: Optional[List[str]] = None,
|
||||
pdf_image_output_dir_path: Optional[str] = None,
|
||||
pdf_extract_to_payload: bool = False,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_image_block_types: Optional[List[str]] = None,
|
||||
extract_image_block_output_dir: Optional[str] = None,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
xml_keep_tags: bool = False,
|
||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
@ -194,27 +194,28 @@ def partition(
|
||||
additional metadata field, "text_as_html," where the value (string) is just a
|
||||
transformation of the data into an HTML <table>.
|
||||
The "text" field for a partitioned Table Element is always present, whether True or False.
|
||||
pdf_extract_images
|
||||
extract_images_in_pdf
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
|
||||
or stored as base64 encoded data within metadata fields.
|
||||
If True, any detected images will be saved in the path specified by
|
||||
'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
|
||||
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||
'extract_element_types' for broader extraction capabilities.
|
||||
pdf_extract_element_types
|
||||
'extract_image_block_types' for broader extraction capabilities.
|
||||
extract_image_block_types
|
||||
Only applicable if `strategy=hi_res`.
|
||||
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
|
||||
within metadata fields.
|
||||
pdf_extract_to_payload
|
||||
saved in the path specified by 'extract_image_block_output_dir' or stored as base64
|
||||
encoded data within metadata fields.
|
||||
extract_image_block_to_payload
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
|
||||
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||
If True, images of the element type(s) defined in 'extract_image_block_types' will be
|
||||
encoded as base64 data and stored in two metadata fields: 'image_base64' and
|
||||
'image_mime_type'.
|
||||
This parameter facilitates the inclusion of element data directly within the payload,
|
||||
especially for web-based applications or APIs.
|
||||
pdf_image_output_dir_path
|
||||
Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
|
||||
extract_image_block_output_dir
|
||||
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
|
||||
The filesystem path for saving images of the element type(s)
|
||||
specified in 'extract_element_types'.
|
||||
specified in 'extract_image_block_types'.
|
||||
xml_keep_tags
|
||||
If True, will retain the XML tags in the output. Otherwise it will simply extract
|
||||
the text from within the tags. Only applies to partition_xml.
|
||||
@ -413,11 +414,11 @@ def partition(
|
||||
infer_table_structure=infer_table_structure,
|
||||
strategy=strategy,
|
||||
languages=languages,
|
||||
extract_images_in_pdf=pdf_extract_images,
|
||||
extract_element_types=pdf_extract_element_types,
|
||||
image_output_dir_path=pdf_image_output_dir_path,
|
||||
extract_to_payload=pdf_extract_to_payload,
|
||||
hi_res_model_name=hi_res_model_name or model_name,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF):
|
||||
@ -430,6 +431,10 @@ def partition(
|
||||
strategy=strategy,
|
||||
languages=languages,
|
||||
hi_res_model_name=hi_res_model_name or model_name,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
elif filetype == FileType.TXT:
|
||||
|
||||
@ -27,9 +27,9 @@ def partition_image(
|
||||
chunking_strategy: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
extract_image_block_types: Optional[List[str]] = None,
|
||||
extract_image_block_output_dir: Optional[str] = None,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses an image into a list of interpreted elements.
|
||||
@ -64,25 +64,26 @@ def partition_image(
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||
extract_images_in_pdf
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
|
||||
or stored as base64 encoded data within metadata fields.
|
||||
If True, any detected images will be saved in the path specified by
|
||||
'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
|
||||
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||
'extract_element_types' for broader extraction capabilities.
|
||||
extract_element_types
|
||||
'extract_image_block_types' for broader extraction capabilities.
|
||||
extract_image_block_types
|
||||
Only applicable if `strategy=hi_res`.
|
||||
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
|
||||
within metadata fields.
|
||||
extract_to_payload
|
||||
saved in the path specified by 'extract_image_block_output_dir' or stored as base64 encoded
|
||||
data within metadata fields.
|
||||
extract_image_block_to_payload
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
|
||||
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||
If True, images of the element type(s) defined in 'extract_image_block_types' will be
|
||||
encoded as base64 data and stored in two metadata fields: 'image_base64' and
|
||||
'image_mime_type'.
|
||||
This parameter facilitates the inclusion of element data directly within the payload,
|
||||
especially for web-based applications or APIs.
|
||||
image_output_dir_path
|
||||
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
|
||||
extract_image_block_output_dir
|
||||
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
|
||||
The filesystem path for saving images of the element type(s)
|
||||
specified in 'extract_element_types'.
|
||||
specified in 'extract_image_block_types'.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -119,8 +120,8 @@ def partition_image(
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
extract_to_payload=extract_to_payload,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -141,9 +141,9 @@ def partition_pdf(
|
||||
links: Sequence[Link] = [],
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
extract_image_block_types: Optional[List[str]] = None,
|
||||
extract_image_block_output_dir: Optional[str] = None,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
@ -177,25 +177,26 @@ def partition_pdf(
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||
extract_images_in_pdf
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
|
||||
or stored as base64 encoded data within metadata fields.
|
||||
If True, any detected images will be saved in the path specified by
|
||||
'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
|
||||
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||
'extract_element_types' for broader extraction capabilities.
|
||||
extract_element_types
|
||||
'extract_image_block_types' for broader extraction capabilities.
|
||||
extract_image_block_types
|
||||
Only applicable if `strategy=hi_res`.
|
||||
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
|
||||
within metadata fields.
|
||||
extract_to_payload
|
||||
saved in the path specified by 'extract_image_block_output_dir' or stored as base64
|
||||
encoded data within metadata fields.
|
||||
extract_image_block_to_payload
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
|
||||
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||
If True, images of the element type(s) defined in 'extract_image_block_types' will be
|
||||
encoded as base64 data and stored in two metadata fields: 'image_base64' and
|
||||
'image_mime_type'.
|
||||
This parameter facilitates the inclusion of element data directly within the payload,
|
||||
especially for web-based applications or APIs.
|
||||
image_output_dir_path
|
||||
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
|
||||
extract_image_block_output_dir
|
||||
Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
|
||||
The filesystem path for saving images of the element type(s)
|
||||
specified in 'extract_element_types'.
|
||||
specified in 'extract_image_block_types'.
|
||||
"""
|
||||
|
||||
exactly_one(filename=filename, file=file)
|
||||
@ -212,9 +213,9 @@ def partition_pdf(
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
extract_to_payload=extract_to_payload,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -266,9 +267,9 @@ def _partition_pdf_or_image_local(
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
pdf_text_extractable: bool = False,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
extract_image_block_types: Optional[List[str]] = None,
|
||||
extract_image_block_output_dir: Optional[str] = None,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
analysis: bool = False,
|
||||
analyzed_image_output_dir_path: Optional[str] = None,
|
||||
**kwargs,
|
||||
@ -406,7 +407,7 @@ def _partition_pdf_or_image_local(
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
extract_element_types = check_element_types_to_extract(extract_element_types)
|
||||
extract_image_block_types = check_element_types_to_extract(extract_image_block_types)
|
||||
# NOTE(christine): `extract_images_in_pdf` would deprecate
|
||||
# (but continue to support for a while)
|
||||
if extract_images_in_pdf:
|
||||
@ -417,11 +418,11 @@ def _partition_pdf_or_image_local(
|
||||
file=file,
|
||||
is_image=is_image,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
extract_to_payload=extract_to_payload,
|
||||
output_dir_path=image_output_dir_path,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
output_dir_path=extract_image_block_output_dir,
|
||||
)
|
||||
|
||||
for el_type in extract_element_types:
|
||||
for el_type in extract_image_block_types:
|
||||
if extract_images_in_pdf and el_type == ElementType.IMAGE:
|
||||
continue
|
||||
|
||||
@ -432,8 +433,8 @@ def _partition_pdf_or_image_local(
|
||||
file=file,
|
||||
is_image=is_image,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
extract_to_payload=extract_to_payload,
|
||||
output_dir_path=image_output_dir_path,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
output_dir_path=extract_image_block_output_dir,
|
||||
)
|
||||
|
||||
out_elements = []
|
||||
@ -444,7 +445,7 @@ def _partition_pdf_or_image_local(
|
||||
if isinstance(el, Image):
|
||||
if (
|
||||
not extract_images_in_pdf
|
||||
and ElementType.IMAGE not in extract_element_types
|
||||
and ElementType.IMAGE not in extract_image_block_types
|
||||
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
|
||||
):
|
||||
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
||||
@ -478,9 +479,9 @@ def partition_pdf_or_image(
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
extract_image_block_types: Optional[List[str]] = None,
|
||||
extract_image_block_output_dir: Optional[str] = None,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||
@ -523,7 +524,7 @@ def partition_pdf_or_image(
|
||||
pdf_text_extractable=pdf_text_extractable,
|
||||
infer_table_structure=infer_table_structure,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
)
|
||||
|
||||
if file is not None:
|
||||
@ -544,9 +545,9 @@ def partition_pdf_or_image(
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
pdf_text_extractable=pdf_text_extractable,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
extract_to_payload=extract_to_payload,
|
||||
extract_image_block_types=extract_image_block_types,
|
||||
extract_image_block_output_dir=extract_image_block_output_dir,
|
||||
extract_image_block_to_payload=extract_image_block_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
out_elements = _process_uncategorized_text_elements(elements)
|
||||
|
||||
@ -82,7 +82,7 @@ def save_elements(
|
||||
filename: str = "",
|
||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||
is_image: bool = False,
|
||||
extract_to_payload: bool = False,
|
||||
extract_image_block_to_payload: bool = False,
|
||||
output_dir_path: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
@ -143,7 +143,7 @@ def save_elements(
|
||||
image_path = image_paths[page_number - 1]
|
||||
image = Image.open(image_path)
|
||||
cropped_image = image.crop((x1, y1, x2, y2))
|
||||
if extract_to_payload:
|
||||
if extract_image_block_to_payload:
|
||||
buffered = BytesIO()
|
||||
cropped_image.save(buffered, format="JPEG")
|
||||
img_base64 = base64.b64encode(buffered.getvalue())
|
||||
@ -159,28 +159,28 @@ def save_elements(
|
||||
|
||||
|
||||
def check_element_types_to_extract(
|
||||
extract_element_types: Optional[List[str]],
|
||||
extract_image_block_types: Optional[List[str]],
|
||||
) -> List[str]:
|
||||
"""Check and normalize the provided list of element types to extract."""
|
||||
|
||||
if extract_element_types is None:
|
||||
if extract_image_block_types is None:
|
||||
return []
|
||||
|
||||
if not isinstance(extract_element_types, list):
|
||||
if not isinstance(extract_image_block_types, list):
|
||||
raise TypeError(
|
||||
"The extract_element_types parameter must be a list of element types as strings, "
|
||||
"The extract_image_block_types parameter must be a list of element types as strings, "
|
||||
"ex. ['Table', 'Image']",
|
||||
)
|
||||
|
||||
available_element_types = list(ElementType.to_dict().values())
|
||||
normalized_extract_element_types = []
|
||||
for el_type in extract_element_types:
|
||||
normalized_extract_image_block_types = []
|
||||
for el_type in extract_image_block_types:
|
||||
normalized_el_type = el_type.lower().capitalize()
|
||||
if normalized_el_type not in available_element_types:
|
||||
logger.warning(f"The requested type ({el_type}) doesn't match any available type")
|
||||
normalized_extract_element_types.append(normalized_el_type)
|
||||
normalized_extract_image_block_types.append(normalized_el_type)
|
||||
|
||||
return normalized_extract_element_types
|
||||
return normalized_extract_image_block_types
|
||||
|
||||
|
||||
def valid_text(text: str) -> bool:
|
||||
|
||||
@ -27,7 +27,7 @@ def determine_pdf_or_image_strategy(
|
||||
pdf_text_extractable: bool = False,
|
||||
infer_table_structure: bool = False,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
extract_image_block_types: Optional[List[str]] = None,
|
||||
):
|
||||
"""Determines what strategy to use for processing PDFs or images, accounting for fallback
|
||||
logic if some dependencies are not available."""
|
||||
@ -35,7 +35,7 @@ def determine_pdf_or_image_strategy(
|
||||
unstructured_inference_installed = dependency_exists("unstructured_inference")
|
||||
|
||||
if strategy == PartitionStrategy.AUTO:
|
||||
extract_element = extract_images_in_pdf or bool(extract_element_types)
|
||||
extract_element = extract_images_in_pdf or bool(extract_image_block_types)
|
||||
if is_image:
|
||||
strategy = _determine_image_auto_strategy()
|
||||
else:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user