mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 11:03:38 +00:00
Feat: return base64 encoded images for PDFs (#2310)
Closes #2302.

### Summary

- Add functionality to get a Base64-encoded string from a PIL image.
- Store Base64-encoded image data in two metadata fields: `image_base64` and `image_mime_type`.
- Update the "image element filter" logic to keep all image elements in the output if a user specifies image extraction.

### Testing

```python
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```

or

```python
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
This commit is contained in:
parent
8ba9fadf8a
commit
dd144456de
@ -6,6 +6,8 @@
|
||||
* **Update encoders to leverage dataclasses** All encoders now follow a class-based approach and are annotated with the dataclass decorator. Similar to the connectors, each encoder uses a nested dataclass for the configuration required to set up a client, as well as a field/property approach to cache the client. This ensures that any variable associated with the class exists as a dataclass field.
|
||||
|
||||
### Features
|
||||
|
||||
* **Store base64 encoded image data in metadata fields.** When the user requests it (e.g. via a parameter such as `pdf_extract_to_payload`), extracted images are no longer saved to a file; instead, the base64-encoded image bytes are stored in the `image_base64` metadata field and the image's MIME type in `image_mime_type`. This allows the API to have parity with the library.
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
BIN
example-docs/embedded-images-tables.jpg
Normal file
BIN
example-docs/embedded-images-tables.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 251 KiB |
944
example-docs/embedded-images-tables.pdf
Normal file
944
example-docs/embedded-images-tables.pdf
Normal file
File diff suppressed because one or more lines are too long
@ -1,5 +1,6 @@
|
||||
import os
|
||||
import pathlib
|
||||
import tempfile
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
@ -7,6 +8,7 @@ from PIL import Image
|
||||
from pytesseract import TesseractError
|
||||
from unstructured_inference.inference import layout
|
||||
|
||||
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||
from unstructured.chunking.title import chunk_by_title
|
||||
from unstructured.documents.elements import ElementType
|
||||
@ -632,3 +634,34 @@ def test_partition_image_has_filename(inference_results):
|
||||
assert element.metadata.filetype == "JPEG"
|
||||
# This should be kept from the filename we originally gave
|
||||
assert element.metadata.filename == filename
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_image_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.jpg"),
):
    """Partition the example image with `hi_res` and check Image/Table extraction.

    The image is supplied either by path (`file_mode == "filename"`) or as an open binary
    file object (any other `file_mode`), with extraction going either to the temporary
    output directory or to the element payload, depending on `extract_to_payload`.
    """
    extract_element_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # -- keyword arguments common to both ways of supplying the input --
        shared_kwargs = {
            "strategy": "hi_res",
            "extract_element_types": extract_element_types,
            "extract_to_payload": extract_to_payload,
            "image_output_dir_path": tmpdir,
        }

        if file_mode == "filename":
            elements = image.partition_image(filename=filename, **shared_kwargs)
        else:
            with open(filename, "rb") as f:
                elements = image.partition_image(file=f, **shared_kwargs)

        # -- checked while `tmpdir` still exists so saved files can be found on disk --
        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
import base64
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import tempfile
|
||||
from tempfile import SpooledTemporaryFile
|
||||
from unittest import mock
|
||||
|
||||
@ -15,6 +17,7 @@ from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import (
|
||||
CoordinatesMetadata,
|
||||
ElementMetadata,
|
||||
ElementType,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
Text,
|
||||
@ -1123,3 +1126,62 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
|
||||
caplog.set_level(logging.INFO)
|
||||
assert pdf.extractable_elements(filename=example_doc_path(filename))
|
||||
assert expected_log in caplog.text
|
||||
|
||||
|
||||
def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
    """Assert that every element of the requested types carries its extracted image.

    Args:
        elements: Partitioned elements; each must expose `.category` and `.metadata`.
        extract_element_types: Element categories (e.g. ["Image", "Table"]) to check.
        extract_to_payload: When True, the image is expected as base64 data in
            `metadata.image_base64` / `metadata.image_mime_type` with no `image_path`.
            When False, the image is expected as a file under `tmpdir` with the payload
            fields unset.
        tmpdir: Directory in which saved image files are expected (only consulted when
            `extract_to_payload` is False).

    Raises:
        AssertionError: If an element does not satisfy the expectations above.
        binascii.Error: If `image_base64` is not valid base64 (a `ValueError` subclass).
    """
    for el_type in extract_element_types:
        # -- the index used in the expected file name restarts at 1 for each element type --
        elements_of_type = [el for el in elements if el.category == el_type]

        for i, el in enumerate(elements_of_type):
            if extract_to_payload:
                assert el.metadata.image_base64 is not None
                assert el.metadata.image_mime_type == "image/jpeg"
                # -- validate=True rejects non-base64 characters; without it they are
                # -- silently discarded and any string "decodes" to bytes, making the
                # -- check meaningless
                image_data = base64.b64decode(el.metadata.image_base64, validate=True)
                # -- the payload is declared "image/jpeg", so it must begin with the JPEG
                # -- start-of-image marker
                assert image_data.startswith(b"\xff\xd8")
                assert el.metadata.image_path is None
            else:
                basename = "table" if el.category == ElementType.TABLE else "figure"
                expected_image_path = os.path.join(
                    str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
                )
                assert el.metadata.image_path == expected_image_path
                assert os.path.isfile(expected_image_path)
                assert el.metadata.image_base64 is None
                assert el.metadata.image_mime_type is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_pdf_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.pdf"),
):
    """Partition the example PDF with `hi_res` and check Image/Table extraction.

    The PDF is supplied either by path (`file_mode == "filename"`) or as an open binary
    file object (any other `file_mode`), with extraction going either to the temporary
    output directory or to the element payload, depending on `extract_to_payload`.
    """
    extract_element_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # -- keyword arguments common to both ways of supplying the input --
        shared_kwargs = {
            "strategy": "hi_res",
            "extract_element_types": extract_element_types,
            "extract_to_payload": extract_to_payload,
            "image_output_dir_path": tmpdir,
        }

        if file_mode == "filename":
            elements = pdf.partition_pdf(filename=filename, **shared_kwargs)
        else:
            with open(filename, "rb") as f:
                elements = pdf.partition_pdf(file=f, **shared_kwargs)

        # -- checked while `tmpdir` still exists so saved files can be found on disk --
        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
|
||||
|
||||
@ -7,7 +7,7 @@ from PIL import Image as PILImg
|
||||
|
||||
from test_unstructured.unit_utils import example_doc_path
|
||||
from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import ElementMetadata, ElementType, Image
|
||||
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
|
||||
from unstructured.partition.pdf_image import pdf_image_utils
|
||||
|
||||
|
||||
@ -60,58 +60,66 @@ def test_convert_pdf_to_image(
|
||||
assert isinstance(images[0], PILImg.Image)
|
||||
|
||||
|
||||
def test_save_elements(filename=example_doc_path("embedded-images.pdf")):
|
||||
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
|
||||
@pytest.mark.parametrize("extract_to_payload", [False, True])
|
||||
def test_save_elements(
|
||||
element_category_to_save,
|
||||
extract_to_payload,
|
||||
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
||||
):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
elements = [
|
||||
Image(
|
||||
text="3",
|
||||
coordinates=(
|
||||
(78.7401411111111, 86.61545694444455),
|
||||
(78.7401411111111, 519.9487805555556),
|
||||
(512.0734647222223, 519.9487805555556),
|
||||
(512.0734647222223, 86.61545694444455),
|
||||
),
|
||||
coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
|
||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||
metadata=ElementMetadata(page_number=1),
|
||||
),
|
||||
Image(
|
||||
text="4",
|
||||
coordinates=(
|
||||
(570.8661397222222, 86.6154566666667),
|
||||
(570.8661397222222, 519.6862825000001),
|
||||
(1003.9369655555556, 519.6862825000001),
|
||||
(1003.9369655555556, 86.6154566666667),
|
||||
),
|
||||
coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
|
||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||
metadata=ElementMetadata(page_number=1),
|
||||
),
|
||||
Image(
|
||||
text="5",
|
||||
coordinates=(
|
||||
(1062.9921808333331, 86.61545694444455),
|
||||
(1062.9921808333331, 519.9487805555556),
|
||||
(1496.3255044444445, 519.9487805555556),
|
||||
(1496.3255044444445, 86.61545694444455),
|
||||
),
|
||||
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||
metadata=ElementMetadata(page_number=1),
|
||||
),
|
||||
Table(
|
||||
text="Sample Table",
|
||||
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||
metadata=ElementMetadata(page_number=2),
|
||||
),
|
||||
]
|
||||
|
||||
pdf_image_utils.save_elements(
|
||||
elements=elements,
|
||||
element_category_to_save=ElementType.IMAGE,
|
||||
element_category_to_save=element_category_to_save,
|
||||
pdf_image_dpi=200,
|
||||
filename=filename,
|
||||
output_dir_path=str(tmpdir),
|
||||
extract_to_payload=extract_to_payload,
|
||||
)
|
||||
|
||||
for i, el in enumerate(elements):
|
||||
saved_elements = [el for el in elements if el.category == element_category_to_save]
|
||||
for i, el in enumerate(saved_elements):
|
||||
basename = "table" if el.category == ElementType.TABLE else "figure"
|
||||
expected_image_path = os.path.join(
|
||||
str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
|
||||
str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
|
||||
)
|
||||
assert os.path.isfile(el.metadata.image_path)
|
||||
assert el.metadata.image_path == expected_image_path
|
||||
if extract_to_payload:
|
||||
assert isinstance(el.metadata.image_base64, str)
|
||||
assert isinstance(el.metadata.image_mime_type, str)
|
||||
assert not el.metadata.image_path
|
||||
assert not os.path.isfile(expected_image_path)
|
||||
else:
|
||||
assert os.path.isfile(expected_image_path)
|
||||
assert el.metadata.image_path == expected_image_path
|
||||
assert not el.metadata.image_base64
|
||||
assert not el.metadata.image_mime_type
|
||||
|
||||
|
||||
def test_write_image_raises_error():
|
||||
|
||||
@ -3,7 +3,7 @@ import os
|
||||
import pathlib
|
||||
import warnings
|
||||
from importlib import import_module
|
||||
from unittest.mock import ANY, Mock, patch
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import docx
|
||||
import pytest
|
||||
@ -347,15 +347,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
||||
|
||||
mock_partition.assert_called_once_with(
|
||||
filename=filename,
|
||||
metadata_filename=None,
|
||||
file=None,
|
||||
url=None,
|
||||
include_page_breaks=False,
|
||||
infer_table_structure=False,
|
||||
extract_images_in_pdf=ANY,
|
||||
image_output_dir_path=ANY,
|
||||
strategy=PartitionStrategy.FAST,
|
||||
languages=None,
|
||||
metadata_filename=None,
|
||||
include_page_breaks=False,
|
||||
infer_table_structure=False,
|
||||
extract_images_in_pdf=False,
|
||||
extract_element_types=None,
|
||||
image_output_dir_path=None,
|
||||
extract_to_payload=False,
|
||||
hi_res_model_name=None,
|
||||
)
|
||||
|
||||
|
||||
@ -173,9 +173,11 @@ class ElementMetadata:
|
||||
file_directory: Optional[str]
|
||||
filename: Optional[str]
|
||||
filetype: Optional[str]
|
||||
image_path: Optional[str]
|
||||
image_base64: Optional[str]
|
||||
image_mime_type: Optional[str]
|
||||
# -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
|
||||
header_footer_type: Optional[str]
|
||||
image_path: Optional[str]
|
||||
# -- used in chunks only, when chunk must be split mid-text to fit window --
|
||||
is_continuation: Optional[bool]
|
||||
languages: Optional[List[str]]
|
||||
@ -457,6 +459,8 @@ class ConsolidationStrategy(enum.Enum):
|
||||
"filetype": cls.FIRST,
|
||||
"header_footer_type": cls.DROP,
|
||||
"image_path": cls.DROP,
|
||||
"image_base64": cls.DROP,
|
||||
"image_mime_type": cls.DROP,
|
||||
"is_continuation": cls.DROP, # -- not expected, added by chunking, not before --
|
||||
"languages": cls.LIST_UNIQUE,
|
||||
"last_modified": cls.FIRST,
|
||||
|
||||
@ -137,7 +137,9 @@ def partition(
|
||||
detect_language_per_element: bool = False,
|
||||
pdf_infer_table_structure: bool = False,
|
||||
pdf_extract_images: bool = False,
|
||||
pdf_extract_element_types: Optional[List[str]] = None,
|
||||
pdf_image_output_dir_path: Optional[str] = None,
|
||||
pdf_extract_to_payload: bool = False,
|
||||
xml_keep_tags: bool = False,
|
||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||
metadata_filename: Optional[str] = None,
|
||||
@ -193,11 +195,26 @@ def partition(
|
||||
transformation of the data into an HTML <table>.
|
||||
The "text" field for a partitioned Table Element is always present, whether True or False.
|
||||
pdf_extract_images
|
||||
If True and strategy=hi_res, any detected images will be saved in the path specified by
|
||||
pdf_image_output_dir_path.
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any detected images will be saved in the path specified by 'pdf_image_output_dir_path'
|
||||
or stored as base64 encoded data within metadata fields.
|
||||
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||
'pdf_extract_element_types' for broader extraction capabilities.
|
||||
pdf_extract_element_types
|
||||
Only applicable if `strategy=hi_res`.
|
||||
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||
saved in the path specified by 'pdf_image_output_dir_path' or stored as base64 encoded data
|
||||
within metadata fields.
|
||||
pdf_extract_to_payload
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, images of the element type(s) defined in 'pdf_extract_element_types' will be encoded
|
||||
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||
This parameter facilitates the inclusion of element data directly within the payload,
|
||||
especially for web-based applications or APIs.
|
||||
pdf_image_output_dir_path
|
||||
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
|
||||
given path
|
||||
Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
|
||||
The filesystem path for saving images of the element type(s)
|
||||
specified in 'pdf_extract_element_types'.
|
||||
xml_keep_tags
|
||||
If True, will retain the XML tags in the output. Otherwise it will simply extract
|
||||
the text from within the tags. Only applies to partition_xml.
|
||||
@ -397,7 +414,9 @@ def partition(
|
||||
strategy=strategy,
|
||||
languages=languages,
|
||||
extract_images_in_pdf=pdf_extract_images,
|
||||
extract_element_types=pdf_extract_element_types,
|
||||
image_output_dir_path=pdf_image_output_dir_path,
|
||||
extract_to_payload=pdf_extract_to_payload,
|
||||
hi_res_model_name=hi_res_model_name or model_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -26,6 +26,10 @@ def partition_image(
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
chunking_strategy: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses an image into a list of interpreted elements.
|
||||
@ -58,6 +62,27 @@ def partition_image(
|
||||
The last modified date for the document.
|
||||
hi_res_model_name
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||
extract_images_in_pdf
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
|
||||
or stored as base64 encoded data within metadata fields.
|
||||
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||
'extract_element_types' for broader extraction capabilities.
|
||||
extract_element_types
|
||||
Only applicable if `strategy=hi_res`.
|
||||
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
|
||||
within metadata fields.
|
||||
extract_to_payload
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
|
||||
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||
This parameter facilitates the inclusion of element data directly within the payload,
|
||||
especially for web-based applications or APIs.
|
||||
image_output_dir_path
|
||||
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
|
||||
The filesystem path for saving images of the element type(s)
|
||||
specified in 'extract_element_types'.
|
||||
"""
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
@ -93,5 +118,9 @@ def partition_image(
|
||||
strategy=strategy,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
extract_to_payload=extract_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -106,7 +106,6 @@ from unstructured.utils import requires_dependencies
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
|
||||
|
||||
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
|
||||
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
|
||||
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore
|
||||
@ -140,10 +139,11 @@ def partition_pdf(
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
chunking_strategy: Optional[str] = None, # used by decorator
|
||||
links: Sequence[Link] = [],
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf document into a list of interpreted elements.
|
||||
@ -173,18 +173,29 @@ def partition_pdf(
|
||||
with Tesseract, you'll first need to install the appropriate Tesseract language pack.
|
||||
metadata_last_modified
|
||||
The last modified date for the document.
|
||||
extract_images_in_pdf
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If `True`, any detected images will be saved in the path specified by
|
||||
image_output_dir_path.
|
||||
extract_element_types
|
||||
Only applicable if `strategy=hi_res`.
|
||||
Images of the element type(s) defined in this list will be saved to `image_output_dir_path`.
|
||||
image_output_dir_path
|
||||
Only applicable if `strategy=hi_res`.
|
||||
The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
|
||||
hi_res_model_name
|
||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||
extract_images_in_pdf
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
|
||||
or stored as base64 encoded data within metadata fields.
|
||||
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||
'extract_element_types' for broader extraction capabilities.
|
||||
extract_element_types
|
||||
Only applicable if `strategy=hi_res`.
|
||||
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
|
||||
within metadata fields.
|
||||
extract_to_payload
|
||||
Only applicable if `strategy=hi_res`.
|
||||
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
|
||||
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||
This parameter facilitates the inclusion of element data directly within the payload,
|
||||
especially for web-based applications or APIs.
|
||||
image_output_dir_path
|
||||
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
|
||||
The filesystem path for saving images of the element type(s)
|
||||
specified in 'extract_element_types'.
|
||||
"""
|
||||
|
||||
exactly_one(filename=filename, file=file)
|
||||
@ -199,10 +210,11 @@ def partition_pdf(
|
||||
infer_table_structure=infer_table_structure,
|
||||
languages=languages,
|
||||
metadata_last_modified=metadata_last_modified,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
extract_to_payload=extract_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@ -249,13 +261,14 @@ def _partition_pdf_or_image_local(
|
||||
languages: Optional[List[str]] = None,
|
||||
ocr_mode: str = OCRMode.FULL_PAGE.value,
|
||||
model_name: Optional[str] = None, # to be deprecated in favor of `hi_res_model_name`
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
pdf_image_dpi: Optional[int] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
pdf_text_extractable: bool = False,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
pdf_image_dpi: Optional[int] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
analysis: bool = False,
|
||||
analyzed_image_output_dir_path: Optional[str] = None,
|
||||
**kwargs,
|
||||
@ -402,7 +415,9 @@ def _partition_pdf_or_image_local(
|
||||
element_category_to_save=ElementType.IMAGE,
|
||||
filename=filename,
|
||||
file=file,
|
||||
is_image=is_image,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
extract_to_payload=extract_to_payload,
|
||||
output_dir_path=image_output_dir_path,
|
||||
)
|
||||
|
||||
@ -415,7 +430,9 @@ def _partition_pdf_or_image_local(
|
||||
element_category_to_save=el_type,
|
||||
filename=filename,
|
||||
file=file,
|
||||
is_image=is_image,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
extract_to_payload=extract_to_payload,
|
||||
output_dir_path=image_output_dir_path,
|
||||
)
|
||||
|
||||
@ -425,10 +442,12 @@ def _partition_pdf_or_image_local(
|
||||
continue
|
||||
|
||||
if isinstance(el, Image):
|
||||
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
||||
if not el.metadata.image_path and (
|
||||
el.text is None or len(el.text) < 24 or el.text.find(" ") == -1
|
||||
if (
|
||||
not extract_images_in_pdf
|
||||
and ElementType.IMAGE not in extract_element_types
|
||||
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
|
||||
):
|
||||
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
||||
continue
|
||||
else:
|
||||
out_elements.append(cast(Element, el))
|
||||
@ -457,10 +476,11 @@ def partition_pdf_or_image(
|
||||
ocr_languages: Optional[str] = None,
|
||||
languages: Optional[List[str]] = None,
|
||||
metadata_last_modified: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_images_in_pdf: bool = False,
|
||||
extract_element_types: Optional[List[str]] = None,
|
||||
image_output_dir_path: Optional[str] = None,
|
||||
hi_res_model_name: Optional[str] = None,
|
||||
extract_to_payload: bool = False,
|
||||
**kwargs,
|
||||
) -> List[Element]:
|
||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||
@ -518,11 +538,12 @@ def partition_pdf_or_image(
|
||||
include_page_breaks=include_page_breaks,
|
||||
languages=languages,
|
||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
pdf_text_extractable=pdf_text_extractable,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
extract_element_types=extract_element_types,
|
||||
image_output_dir_path=image_output_dir_path,
|
||||
hi_res_model_name=hi_res_model_name,
|
||||
extract_to_payload=extract_to_payload,
|
||||
**kwargs,
|
||||
)
|
||||
out_elements = _process_uncategorized_text_elements(elements)
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
import base64
|
||||
import os
|
||||
import tempfile
|
||||
from io import BytesIO
|
||||
from pathlib import PurePath
|
||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
||||
|
||||
@ -79,11 +81,17 @@ def save_elements(
|
||||
pdf_image_dpi: int,
|
||||
filename: str = "",
|
||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||
is_image: bool = False,
|
||||
extract_to_payload: bool = False,
|
||||
output_dir_path: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Extract and save images from the page. This method iterates through the layout elements
|
||||
of the page, identifies image regions, and extracts and saves them as separate image files.
|
||||
Saves specific elements from a PDF as images either to a directory or embeds them in the
|
||||
element's payload.
|
||||
|
||||
This function processes a list of elements partitioned from a PDF file. For each element of
|
||||
a specified category, it extracts and saves the image. The images can either be saved to
|
||||
a specified directory or embedded into the element's payload as a base64-encoded string.
|
||||
"""
|
||||
|
||||
if not output_dir_path:
|
||||
@ -91,14 +99,25 @@ def save_elements(
|
||||
os.makedirs(output_dir_path, exist_ok=True)
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
_image_paths = convert_pdf_to_image(
|
||||
filename,
|
||||
file,
|
||||
pdf_image_dpi,
|
||||
output_folder=temp_dir,
|
||||
path_only=True,
|
||||
)
|
||||
image_paths = cast(List[str], _image_paths)
|
||||
if is_image:
|
||||
if file is None:
|
||||
image_paths = [filename]
|
||||
else:
|
||||
if hasattr(file, "seek"):
|
||||
file.seek(0)
|
||||
temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
|
||||
temp_file.write(file.read() if hasattr(file, "read") else file)
|
||||
temp_file.flush()
|
||||
image_paths = [temp_file.name]
|
||||
else:
|
||||
_image_paths = convert_pdf_to_image(
|
||||
filename,
|
||||
file,
|
||||
pdf_image_dpi,
|
||||
output_folder=temp_dir,
|
||||
path_only=True,
|
||||
)
|
||||
image_paths = cast(List[str], _image_paths)
|
||||
|
||||
figure_number = 0
|
||||
for el in elements:
|
||||
@ -124,9 +143,17 @@ def save_elements(
|
||||
image_path = image_paths[page_number - 1]
|
||||
image = Image.open(image_path)
|
||||
cropped_image = image.crop((x1, y1, x2, y2))
|
||||
write_image(cropped_image, output_f_path)
|
||||
# add image path to element metadata
|
||||
el.metadata.image_path = output_f_path
|
||||
if extract_to_payload:
|
||||
buffered = BytesIO()
|
||||
cropped_image.save(buffered, format="JPEG")
|
||||
img_base64 = base64.b64encode(buffered.getvalue())
|
||||
img_base64_str = img_base64.decode()
|
||||
el.metadata.image_base64 = img_base64_str
|
||||
el.metadata.image_mime_type = "image/jpeg"
|
||||
else:
|
||||
write_image(cropped_image, output_f_path)
|
||||
# add image path to element metadata
|
||||
el.metadata.image_path = output_f_path
|
||||
except (ValueError, IOError):
|
||||
logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user