Feat: return base64 encoded images for PDFs (#2310)

Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
This commit is contained in:
Christine Straub 2023-12-26 21:39:01 -08:00 committed by GitHub
parent 8ba9fadf8a
commit dd144456de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 1220 additions and 69 deletions

View File

@ -6,6 +6,8 @@
* **Update encoders to leverage dataclasses** All encoders now follow a class approach which get annotated with the dataclass decorator. Similar to the connectors, it uses a nested dataclass for the configs required to configure a client as well as a field/property approach to cache the client. This makes sure any variable associated with the class exists as a dataclass field. * **Update encoders to leverage dataclasses** All encoders now follow a class approach which get annotated with the dataclass decorator. Similar to the connectors, it uses a nested dataclass for the configs required to configure a client as well as a field/property approach to cache the client. This makes sure any variable associated with the class exists as a dataclass field.
### Features ### Features
* **Store base64 encoded image data in metadata fields.** Rather than saving to file, stores the base64 encoded image bytes and the image's MIME type in the metadata fields `image_base64` and `image_mime_type` when the user requests it via a parameter such as `pdf_extract_to_payload`. This allows the API to have parity with the library.
### Fixes ### Fixes

Binary file not shown.

After

Width:  |  Height:  |  Size: 251 KiB

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,6 @@
import os import os
import pathlib import pathlib
import tempfile
from unittest import mock from unittest import mock
import pytest import pytest
@ -7,6 +8,7 @@ from PIL import Image
from pytesseract import TesseractError from pytesseract import TesseractError
from unstructured_inference.inference import layout from unstructured_inference.inference import layout
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ElementType from unstructured.documents.elements import ElementType
@ -632,3 +634,34 @@ def test_partition_image_has_filename(inference_results):
assert element.metadata.filetype == "JPEG" assert element.metadata.filetype == "JPEG"
# This should be kept from the filename we originally gave # This should be kept from the filename we originally gave
assert element.metadata.filename == filename assert element.metadata.filename == filename
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_image_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.jpg"),
):
    """Check Image/Table extraction from an image file for both input modes.

    The sample image is partitioned with the `hi_res` strategy, either by path
    (`file_mode == "filename"`) or through a binary file object, and the
    resulting elements are handed to `assert_element_extraction`.
    """
    extract_element_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # Arguments shared by both input modes; only the source argument differs.
        shared_kwargs = dict(
            strategy="hi_res",
            extract_element_types=extract_element_types,
            extract_to_payload=extract_to_payload,
            image_output_dir_path=tmpdir,
        )

        if file_mode == "filename":
            elements = image.partition_image(filename=filename, **shared_kwargs)
        else:
            with open(filename, "rb") as f:
                elements = image.partition_image(file=f, **shared_kwargs)

        # Checked while the temporary directory still exists, since the
        # non-payload branch looks for the written image files inside it.
        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)

View File

@ -1,6 +1,8 @@
import base64
import logging import logging
import math import math
import os import os
import tempfile
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
from unittest import mock from unittest import mock
@ -15,6 +17,7 @@ from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ( from unstructured.documents.elements import (
CoordinatesMetadata, CoordinatesMetadata,
ElementMetadata, ElementMetadata,
ElementType,
ListItem, ListItem,
NarrativeText, NarrativeText,
Text, Text,
@ -1123,3 +1126,62 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
caplog.set_level(logging.INFO) caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(filename)) assert pdf.extractable_elements(filename=example_doc_path(filename))
assert expected_log in caplog.text assert expected_log in caplog.text
def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
    """Assert that extracted elements carry the expected image metadata.

    For every element whose `category` is one of `extract_element_types`:

    - when `extract_to_payload` is true, the element must hold a valid,
      non-empty base64 payload in `metadata.image_base64`, the MIME type
      "image/jpeg" in `metadata.image_mime_type`, and no `metadata.image_path`;
    - otherwise `metadata.image_path` must equal
      `<tmpdir>/<basename>-<page_number>-<index>.jpg`, that file must exist,
      and both payload fields must be unset.

    `<basename>` is "table" for table elements and "figure" for anything else.
    `<index>` is the element's 1-based position among the elements of the same
    category.

    Raises AssertionError when an expectation is not met, and `binascii.Error`
    when the payload is not valid base64.
    """
    for el_type in extract_element_types:
        # The expected file name index is counted separately per category.
        elements_of_type = [el for el in elements if el.category == el_type]

        for i, el in enumerate(elements_of_type):
            if extract_to_payload:
                assert el.metadata.image_base64 is not None
                assert el.metadata.image_mime_type == "image/jpeg"
                # validate=True rejects malformed payloads; the lenient default
                # silently drops invalid characters and so can never fail here.
                image_data = base64.b64decode(el.metadata.image_base64, validate=True)
                assert image_data, "decoded image payload is empty"
                assert el.metadata.image_path is None
            else:
                basename = "table" if el.category == ElementType.TABLE else "figure"
                expected_image_path = os.path.join(
                    str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
                )
                assert el.metadata.image_path == expected_image_path
                assert os.path.isfile(expected_image_path)
                assert el.metadata.image_base64 is None
                assert el.metadata.image_mime_type is None
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_pdf_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.pdf"),
):
    """Check Image/Table extraction from a PDF for both input modes.

    The sample PDF is partitioned with the `hi_res` strategy, either by path
    (`file_mode == "filename"`) or through a binary file object, and the
    resulting elements are handed to `assert_element_extraction`.
    """
    extract_element_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # Arguments shared by both input modes; only the source argument differs.
        shared_kwargs = dict(
            strategy="hi_res",
            extract_element_types=extract_element_types,
            extract_to_payload=extract_to_payload,
            image_output_dir_path=tmpdir,
        )

        if file_mode == "filename":
            elements = pdf.partition_pdf(filename=filename, **shared_kwargs)
        else:
            with open(filename, "rb") as f:
                elements = pdf.partition_pdf(file=f, **shared_kwargs)

        # Checked while the temporary directory still exists, since the
        # non-payload branch looks for the written image files inside it.
        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)

View File

@ -7,7 +7,7 @@ from PIL import Image as PILImg
from test_unstructured.unit_utils import example_doc_path from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, ElementType, Image from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
from unstructured.partition.pdf_image import pdf_image_utils from unstructured.partition.pdf_image import pdf_image_utils
@ -60,58 +60,66 @@ def test_convert_pdf_to_image(
assert isinstance(images[0], PILImg.Image) assert isinstance(images[0], PILImg.Image)
def test_save_elements(filename=example_doc_path("embedded-images.pdf")): @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_save_elements(
element_category_to_save,
extract_to_payload,
filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
elements = [ elements = [
Image( Image(
text="3", text="3",
coordinates=( coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
(78.7401411111111, 86.61545694444455),
(78.7401411111111, 519.9487805555556),
(512.0734647222223, 519.9487805555556),
(512.0734647222223, 86.61545694444455),
),
coordinate_system=PixelSpace(width=1575, height=1166), coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1), metadata=ElementMetadata(page_number=1),
), ),
Image( Image(
text="4", text="4",
coordinates=( coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
(570.8661397222222, 86.6154566666667),
(570.8661397222222, 519.6862825000001),
(1003.9369655555556, 519.6862825000001),
(1003.9369655555556, 86.6154566666667),
),
coordinate_system=PixelSpace(width=1575, height=1166), coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1), metadata=ElementMetadata(page_number=1),
), ),
Image( Image(
text="5", text="5",
coordinates=( coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
(1062.9921808333331, 86.61545694444455),
(1062.9921808333331, 519.9487805555556),
(1496.3255044444445, 519.9487805555556),
(1496.3255044444445, 86.61545694444455),
),
coordinate_system=PixelSpace(width=1575, height=1166), coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=1), metadata=ElementMetadata(page_number=1),
), ),
Table(
text="Sample Table",
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
coordinate_system=PixelSpace(width=1575, height=1166),
metadata=ElementMetadata(page_number=2),
),
] ]
pdf_image_utils.save_elements( pdf_image_utils.save_elements(
elements=elements, elements=elements,
element_category_to_save=ElementType.IMAGE, element_category_to_save=element_category_to_save,
pdf_image_dpi=200, pdf_image_dpi=200,
filename=filename, filename=filename,
output_dir_path=str(tmpdir), output_dir_path=str(tmpdir),
extract_to_payload=extract_to_payload,
) )
for i, el in enumerate(elements): saved_elements = [el for el in elements if el.category == element_category_to_save]
for i, el in enumerate(saved_elements):
basename = "table" if el.category == ElementType.TABLE else "figure"
expected_image_path = os.path.join( expected_image_path = os.path.join(
str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg" str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
) )
assert os.path.isfile(el.metadata.image_path) if extract_to_payload:
assert el.metadata.image_path == expected_image_path assert isinstance(el.metadata.image_base64, str)
assert isinstance(el.metadata.image_mime_type, str)
assert not el.metadata.image_path
assert not os.path.isfile(expected_image_path)
else:
assert os.path.isfile(expected_image_path)
assert el.metadata.image_path == expected_image_path
assert not el.metadata.image_base64
assert not el.metadata.image_mime_type
def test_write_image_raises_error(): def test_write_image_raises_error():

View File

@ -3,7 +3,7 @@ import os
import pathlib import pathlib
import warnings import warnings
from importlib import import_module from importlib import import_module
from unittest.mock import ANY, Mock, patch from unittest.mock import Mock, patch
import docx import docx
import pytest import pytest
@ -347,15 +347,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
mock_partition.assert_called_once_with( mock_partition.assert_called_once_with(
filename=filename, filename=filename,
metadata_filename=None,
file=None, file=None,
url=None, url=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=ANY,
image_output_dir_path=ANY,
strategy=PartitionStrategy.FAST, strategy=PartitionStrategy.FAST,
languages=None, languages=None,
metadata_filename=None,
include_page_breaks=False,
infer_table_structure=False,
extract_images_in_pdf=False,
extract_element_types=None,
image_output_dir_path=None,
extract_to_payload=False,
hi_res_model_name=None, hi_res_model_name=None,
) )

View File

@ -173,9 +173,11 @@ class ElementMetadata:
file_directory: Optional[str] file_directory: Optional[str]
filename: Optional[str] filename: Optional[str]
filetype: Optional[str] filetype: Optional[str]
image_path: Optional[str]
image_base64: Optional[str]
image_mime_type: Optional[str]
# -- specific to DOCX which has distinct primary, first-page, and even-page header/footers -- # -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
header_footer_type: Optional[str] header_footer_type: Optional[str]
image_path: Optional[str]
# -- used in chunks only, when chunk must be split mid-text to fit window -- # -- used in chunks only, when chunk must be split mid-text to fit window --
is_continuation: Optional[bool] is_continuation: Optional[bool]
languages: Optional[List[str]] languages: Optional[List[str]]
@ -457,6 +459,8 @@ class ConsolidationStrategy(enum.Enum):
"filetype": cls.FIRST, "filetype": cls.FIRST,
"header_footer_type": cls.DROP, "header_footer_type": cls.DROP,
"image_path": cls.DROP, "image_path": cls.DROP,
"image_base64": cls.DROP,
"image_mime_type": cls.DROP,
"is_continuation": cls.DROP, # -- not expected, added by chunking, not before -- "is_continuation": cls.DROP, # -- not expected, added by chunking, not before --
"languages": cls.LIST_UNIQUE, "languages": cls.LIST_UNIQUE,
"last_modified": cls.FIRST, "last_modified": cls.FIRST,

View File

@ -137,7 +137,9 @@ def partition(
detect_language_per_element: bool = False, detect_language_per_element: bool = False,
pdf_infer_table_structure: bool = False, pdf_infer_table_structure: bool = False,
pdf_extract_images: bool = False, pdf_extract_images: bool = False,
pdf_extract_element_types: Optional[List[str]] = None,
pdf_image_output_dir_path: Optional[str] = None, pdf_image_output_dir_path: Optional[str] = None,
pdf_extract_to_payload: bool = False,
xml_keep_tags: bool = False, xml_keep_tags: bool = False,
data_source_metadata: Optional[DataSourceMetadata] = None, data_source_metadata: Optional[DataSourceMetadata] = None,
metadata_filename: Optional[str] = None, metadata_filename: Optional[str] = None,
@ -193,11 +195,26 @@ def partition(
transformation of the data into an HTML <table>. transformation of the data into an HTML <table>.
The "text" field for a partitioned Table Element is always present, whether True or False. The "text" field for a partitioned Table Element is always present, whether True or False.
pdf_extract_images pdf_extract_images
If True and strategy=hi_res, any detected images will be saved in the path specified by Only applicable if `strategy=hi_res`.
pdf_image_output_dir_path. If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
pdf_extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
pdf_extract_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
pdf_image_output_dir_path pdf_image_output_dir_path
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
given path The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
xml_keep_tags xml_keep_tags
If True, will retain the XML tags in the output. Otherwise it will simply extract If True, will retain the XML tags in the output. Otherwise it will simply extract
the text from within the tags. Only applies to partition_xml. the text from within the tags. Only applies to partition_xml.
@ -397,7 +414,9 @@ def partition(
strategy=strategy, strategy=strategy,
languages=languages, languages=languages,
extract_images_in_pdf=pdf_extract_images, extract_images_in_pdf=pdf_extract_images,
extract_element_types=pdf_extract_element_types,
image_output_dir_path=pdf_image_output_dir_path, image_output_dir_path=pdf_image_output_dir_path,
extract_to_payload=pdf_extract_to_payload,
hi_res_model_name=hi_res_model_name or model_name, hi_res_model_name=hi_res_model_name or model_name,
**kwargs, **kwargs,
) )

View File

@ -26,6 +26,10 @@ def partition_image(
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None, chunking_strategy: Optional[str] = None,
hi_res_model_name: Optional[str] = None, hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None,
extract_to_payload: bool = False,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Parses an image into a list of interpreted elements. """Parses an image into a list of interpreted elements.
@ -58,6 +62,27 @@ def partition_image(
The last modified date for the document. The last modified date for the document.
hi_res_model_name hi_res_model_name
The layout detection model used when partitioning strategy is set to `hi_res`. The layout detection model used when partitioning strategy is set to `hi_res`.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
extract_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
image_output_dir_path
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
@ -93,5 +118,9 @@ def partition_image(
strategy=strategy, strategy=strategy,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
hi_res_model_name=hi_res_model_name, hi_res_model_name=hi_res_model_name,
extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path,
extract_to_payload=extract_to_payload,
**kwargs, **kwargs,
) )

View File

@ -106,7 +106,6 @@ from unstructured.utils import requires_dependencies
if TYPE_CHECKING: if TYPE_CHECKING:
pass pass
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix # NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885 # the bug: https://github.com/pdfminer/pdfminer.six/pull/885
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore
@ -140,10 +139,11 @@ def partition_pdf(
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
chunking_strategy: Optional[str] = None, # used by decorator chunking_strategy: Optional[str] = None, # used by decorator
links: Sequence[Link] = [], links: Sequence[Link] = [],
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False, extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None, extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None, image_output_dir_path: Optional[str] = None,
hi_res_model_name: Optional[str] = None, extract_to_payload: bool = False,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf document into a list of interpreted elements. """Parses a pdf document into a list of interpreted elements.
@ -173,18 +173,29 @@ def partition_pdf(
with Tesseract, you'll first need to install the appropriate Tesseract language pack. with Tesseract, you'll first need to install the appropriate Tesseract language pack.
metadata_last_modified metadata_last_modified
The last modified date for the document. The last modified date for the document.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If `True`, any detected images will be saved in the path specified by
image_output_dir_path.
extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) defined in this list will be saved to `image_output_dir_path`.
image_output_dir_path
Only applicable if `strategy=hi_res`.
The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
hi_res_model_name hi_res_model_name
The layout detection model used when partitioning strategy is set to `hi_res`. The layout detection model used when partitioning strategy is set to `hi_res`.
extract_images_in_pdf
Only applicable if `strategy=hi_res`.
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
or stored as base64 encoded data within metadata fields.
Deprecation Note: This parameter is marked for deprecation. Future versions will use
'extract_element_types' for broader extraction capabilities.
extract_element_types
Only applicable if `strategy=hi_res`.
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
within metadata fields.
extract_to_payload
Only applicable if `strategy=hi_res`.
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
This parameter facilitates the inclusion of element data directly within the payload,
especially for web-based applications or APIs.
image_output_dir_path
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
The filesystem path for saving images of the element type(s)
specified in 'extract_element_types'.
""" """
exactly_one(filename=filename, file=file) exactly_one(filename=filename, file=file)
@ -199,10 +210,11 @@ def partition_pdf(
infer_table_structure=infer_table_structure, infer_table_structure=infer_table_structure,
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
hi_res_model_name=hi_res_model_name,
extract_images_in_pdf=extract_images_in_pdf, extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types, extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path, image_output_dir_path=image_output_dir_path,
hi_res_model_name=hi_res_model_name, extract_to_payload=extract_to_payload,
**kwargs, **kwargs,
) )
@ -249,13 +261,14 @@ def _partition_pdf_or_image_local(
languages: Optional[List[str]] = None, languages: Optional[List[str]] = None,
ocr_mode: str = OCRMode.FULL_PAGE.value, ocr_mode: str = OCRMode.FULL_PAGE.value,
model_name: Optional[str] = None, # to be deprecated in favor of `hi_res_model_name` model_name: Optional[str] = None, # to be deprecated in favor of `hi_res_model_name`
hi_res_model_name: Optional[str] = None,
pdf_image_dpi: Optional[int] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
pdf_text_extractable: bool = False, pdf_text_extractable: bool = False,
extract_images_in_pdf: bool = False, extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None, extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None, image_output_dir_path: Optional[str] = None,
pdf_image_dpi: Optional[int] = None, extract_to_payload: bool = False,
hi_res_model_name: Optional[str] = None,
analysis: bool = False, analysis: bool = False,
analyzed_image_output_dir_path: Optional[str] = None, analyzed_image_output_dir_path: Optional[str] = None,
**kwargs, **kwargs,
@ -402,7 +415,9 @@ def _partition_pdf_or_image_local(
element_category_to_save=ElementType.IMAGE, element_category_to_save=ElementType.IMAGE,
filename=filename, filename=filename,
file=file, file=file,
is_image=is_image,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
extract_to_payload=extract_to_payload,
output_dir_path=image_output_dir_path, output_dir_path=image_output_dir_path,
) )
@ -415,7 +430,9 @@ def _partition_pdf_or_image_local(
element_category_to_save=el_type, element_category_to_save=el_type,
filename=filename, filename=filename,
file=file, file=file,
is_image=is_image,
pdf_image_dpi=pdf_image_dpi, pdf_image_dpi=pdf_image_dpi,
extract_to_payload=extract_to_payload,
output_dir_path=image_output_dir_path, output_dir_path=image_output_dir_path,
) )
@ -425,10 +442,12 @@ def _partition_pdf_or_image_local(
continue continue
if isinstance(el, Image): if isinstance(el, Image):
# NOTE(crag): small chunks of text from Image elements tend to be garbage if (
if not el.metadata.image_path and ( not extract_images_in_pdf
el.text is None or len(el.text) < 24 or el.text.find(" ") == -1 and ElementType.IMAGE not in extract_element_types
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
): ):
# NOTE(crag): small chunks of text from Image elements tend to be garbage
continue continue
else: else:
out_elements.append(cast(Element, el)) out_elements.append(cast(Element, el))
@ -457,10 +476,11 @@ def partition_pdf_or_image(
ocr_languages: Optional[str] = None, ocr_languages: Optional[str] = None,
languages: Optional[List[str]] = None, languages: Optional[List[str]] = None,
metadata_last_modified: Optional[str] = None, metadata_last_modified: Optional[str] = None,
hi_res_model_name: Optional[str] = None,
extract_images_in_pdf: bool = False, extract_images_in_pdf: bool = False,
extract_element_types: Optional[List[str]] = None, extract_element_types: Optional[List[str]] = None,
image_output_dir_path: Optional[str] = None, image_output_dir_path: Optional[str] = None,
hi_res_model_name: Optional[str] = None, extract_to_payload: bool = False,
**kwargs, **kwargs,
) -> List[Element]: ) -> List[Element]:
"""Parses a pdf or image document into a list of interpreted elements.""" """Parses a pdf or image document into a list of interpreted elements."""
@ -518,11 +538,12 @@ def partition_pdf_or_image(
include_page_breaks=include_page_breaks, include_page_breaks=include_page_breaks,
languages=languages, languages=languages,
metadata_last_modified=metadata_last_modified or last_modification_date, metadata_last_modified=metadata_last_modified or last_modification_date,
hi_res_model_name=hi_res_model_name,
pdf_text_extractable=pdf_text_extractable, pdf_text_extractable=pdf_text_extractable,
extract_images_in_pdf=extract_images_in_pdf, extract_images_in_pdf=extract_images_in_pdf,
extract_element_types=extract_element_types, extract_element_types=extract_element_types,
image_output_dir_path=image_output_dir_path, image_output_dir_path=image_output_dir_path,
hi_res_model_name=hi_res_model_name, extract_to_payload=extract_to_payload,
**kwargs, **kwargs,
) )
out_elements = _process_uncategorized_text_elements(elements) out_elements = _process_uncategorized_text_elements(elements)

View File

@ -1,5 +1,7 @@
import base64
import os import os
import tempfile import tempfile
from io import BytesIO
from pathlib import PurePath from pathlib import PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
@ -79,11 +81,17 @@ def save_elements(
pdf_image_dpi: int, pdf_image_dpi: int,
filename: str = "", filename: str = "",
file: Optional[Union[bytes, BinaryIO]] = None, file: Optional[Union[bytes, BinaryIO]] = None,
is_image: bool = False,
extract_to_payload: bool = False,
output_dir_path: Optional[str] = None, output_dir_path: Optional[str] = None,
): ):
""" """
Extract and save images from the page. This method iterates through the layout elements Saves specific elements from a PDF as images either to a directory or embeds them in the
of the page, identifies image regions, and extracts and saves them as separate image files. element's payload.
This function processes a list of elements partitioned from a PDF file. For each element of
a specified category, it extracts and saves the image. The images can either be saved to
a specified directory or embedded into the element's payload as a base64-encoded string.
""" """
if not output_dir_path: if not output_dir_path:
@ -91,14 +99,25 @@ def save_elements(
os.makedirs(output_dir_path, exist_ok=True) os.makedirs(output_dir_path, exist_ok=True)
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
_image_paths = convert_pdf_to_image( if is_image:
filename, if file is None:
file, image_paths = [filename]
pdf_image_dpi, else:
output_folder=temp_dir, if hasattr(file, "seek"):
path_only=True, file.seek(0)
) temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
image_paths = cast(List[str], _image_paths) temp_file.write(file.read() if hasattr(file, "read") else file)
temp_file.flush()
image_paths = [temp_file.name]
else:
_image_paths = convert_pdf_to_image(
filename,
file,
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
)
image_paths = cast(List[str], _image_paths)
figure_number = 0 figure_number = 0
for el in elements: for el in elements:
@ -124,9 +143,17 @@ def save_elements(
image_path = image_paths[page_number - 1] image_path = image_paths[page_number - 1]
image = Image.open(image_path) image = Image.open(image_path)
cropped_image = image.crop((x1, y1, x2, y2)) cropped_image = image.crop((x1, y1, x2, y2))
write_image(cropped_image, output_f_path) if extract_to_payload:
# add image path to element metadata buffered = BytesIO()
el.metadata.image_path = output_f_path cropped_image.save(buffered, format="JPEG")
img_base64 = base64.b64encode(buffered.getvalue())
img_base64_str = img_base64.decode()
el.metadata.image_base64 = img_base64_str
el.metadata.image_mime_type = "image/jpeg"
else:
write_image(cropped_image, output_f_path)
# add image path to element metadata
el.metadata.image_path = output_f_path
except (ValueError, IOError): except (ValueError, IOError):
logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True) logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)