mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-14 01:17:36 +00:00
Feat: return base64 encoded images for PDFs (#2310)
Closes #2302.

### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64` and `image_mime_type`
- update the "image element filter" logic to keep all image elements in the output if a user specifies image extraction

### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
This commit is contained in:
parent
8ba9fadf8a
commit
dd144456de
@ -7,6 +7,8 @@
|
|||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
* **Store base64 encoded image data in metadata fields.** Rather than saving extracted images to file, stores the base64 encoded image bytes and the image's MIME type in the metadata fields `image_base64` and `image_mime_type` when the user requests it through a parameter such as `pdf_extract_to_payload`. This gives the API parity with the library.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
## 0.11.6
|
## 0.11.6
|
||||||
|
|||||||
BIN
example-docs/embedded-images-tables.jpg
Normal file
BIN
example-docs/embedded-images-tables.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 251 KiB |
944
example-docs/embedded-images-tables.pdf
Normal file
944
example-docs/embedded-images-tables.pdf
Normal file
File diff suppressed because one or more lines are too long
@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import tempfile
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@ -7,6 +8,7 @@ from PIL import Image
|
|||||||
from pytesseract import TesseractError
|
from pytesseract import TesseractError
|
||||||
from unstructured_inference.inference import layout
|
from unstructured_inference.inference import layout
|
||||||
|
|
||||||
|
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
||||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
from unstructured.documents.elements import ElementType
|
from unstructured.documents.elements import ElementType
|
||||||
@ -632,3 +634,34 @@ def test_partition_image_has_filename(inference_results):
|
|||||||
assert element.metadata.filetype == "JPEG"
|
assert element.metadata.filetype == "JPEG"
|
||||||
# This should be kept from the filename we originally gave
|
# This should be kept from the filename we originally gave
|
||||||
assert element.metadata.filename == filename
|
assert element.metadata.filename == filename
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_image_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.jpg"),
):
    """Partition an image with `hi_res` and check the extracted Image/Table elements.

    The document is supplied either as a path (`file_mode == "filename"`) or as an
    open binary file object (any other `file_mode`). The resulting elements are
    handed to `assert_element_extraction` for verification.
    """
    extract_element_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # -- arguments common to both input modes --
        shared_kwargs = dict(
            strategy="hi_res",
            extract_element_types=extract_element_types,
            extract_to_payload=extract_to_payload,
            image_output_dir_path=tmpdir,
        )

        if file_mode != "filename":
            with open(filename, "rb") as f:
                elements = image.partition_image(file=f, **shared_kwargs)
        else:
            elements = image.partition_image(filename=filename, **shared_kwargs)

        # -- must run while `tmpdir` still exists: the helper checks for files in it --
        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
|
import base64
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
@ -15,6 +17,7 @@ from unstructured.documents.coordinates import PixelSpace
|
|||||||
from unstructured.documents.elements import (
|
from unstructured.documents.elements import (
|
||||||
CoordinatesMetadata,
|
CoordinatesMetadata,
|
||||||
ElementMetadata,
|
ElementMetadata,
|
||||||
|
ElementType,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
Text,
|
Text,
|
||||||
@ -1123,3 +1126,62 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
|
|||||||
caplog.set_level(logging.INFO)
|
caplog.set_level(logging.INFO)
|
||||||
assert pdf.extractable_elements(filename=example_doc_path(filename))
|
assert pdf.extractable_elements(filename=example_doc_path(filename))
|
||||||
assert expected_log in caplog.text
|
assert expected_log in caplog.text
|
||||||
|
|
||||||
|
|
||||||
|
def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
    """Assert that elements of the requested types carry the expected extraction metadata.

    Parameters
    ----------
    elements
        Partitioned elements; each must expose `.category` and `.metadata`.
    extract_element_types
        Category names (e.g. ["Image", "Table"]) whose elements are checked.
    extract_to_payload
        When truthy, each matching element must hold a valid, non-empty base64 payload in
        `metadata.image_base64`, the mime type "image/jpeg", and no `image_path`.
        Otherwise each must point at an existing file in `tmpdir` and hold no payload.
    tmpdir
        Directory the images are expected to have been written to.

    Raises
    ------
    AssertionError
        If any matching element violates the expectations above.
    binascii.Error
        If a payload is not strictly valid base64.
    """
    for el_type in extract_element_types:
        # -- the file-name index restarts for each element type --
        elements_of_type = [el for el in elements if el.category == el_type]

        for i, el in enumerate(elements_of_type):
            metadata = el.metadata

            if extract_to_payload:
                assert metadata.image_base64 is not None
                assert metadata.image_mime_type == "image/jpeg"
                # -- validate=True rejects characters outside the base64 alphabet instead of
                # -- silently discarding them, so a corrupt payload cannot pass unnoticed --
                image_data = base64.b64decode(metadata.image_base64, validate=True)
                assert isinstance(image_data, bytes)
                assert len(image_data) > 0
                assert metadata.image_path is None
            else:
                basename = "table" if el.category == ElementType.TABLE else "figure"
                expected_image_path = os.path.join(
                    str(tmpdir), f"{basename}-{metadata.page_number}-{i + 1}.jpg"
                )
                assert metadata.image_path == expected_image_path
                assert os.path.isfile(expected_image_path)
                assert metadata.image_base64 is None
                assert metadata.image_mime_type is None
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_to_payload", [False, True])
def test_partition_pdf_element_extraction(
    file_mode,
    extract_to_payload,
    filename=example_doc_path("embedded-images-tables.pdf"),
):
    """Partition a PDF with `hi_res` and check the extracted Image/Table elements.

    The document is supplied either as a path (`file_mode == "filename"`) or as an
    open binary file object (any other `file_mode`). The resulting elements are
    handed to `assert_element_extraction` for verification.
    """
    extract_element_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # -- arguments common to both input modes --
        shared_kwargs = dict(
            strategy="hi_res",
            extract_element_types=extract_element_types,
            extract_to_payload=extract_to_payload,
            image_output_dir_path=tmpdir,
        )

        if file_mode != "filename":
            with open(filename, "rb") as f:
                elements = pdf.partition_pdf(file=f, **shared_kwargs)
        else:
            elements = pdf.partition_pdf(filename=filename, **shared_kwargs)

        # -- must run while `tmpdir` still exists: the helper checks for files in it --
        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
|
||||||
|
|||||||
@ -7,7 +7,7 @@ from PIL import Image as PILImg
|
|||||||
|
|
||||||
from test_unstructured.unit_utils import example_doc_path
|
from test_unstructured.unit_utils import example_doc_path
|
||||||
from unstructured.documents.coordinates import PixelSpace
|
from unstructured.documents.coordinates import PixelSpace
|
||||||
from unstructured.documents.elements import ElementMetadata, ElementType, Image
|
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
|
||||||
from unstructured.partition.pdf_image import pdf_image_utils
|
from unstructured.partition.pdf_image import pdf_image_utils
|
||||||
|
|
||||||
|
|
||||||
@ -60,58 +60,66 @@ def test_convert_pdf_to_image(
|
|||||||
assert isinstance(images[0], PILImg.Image)
|
assert isinstance(images[0], PILImg.Image)
|
||||||
|
|
||||||
|
|
||||||
def test_save_elements(filename=example_doc_path("embedded-images.pdf")):
|
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
|
||||||
|
@pytest.mark.parametrize("extract_to_payload", [False, True])
|
||||||
|
def test_save_elements(
|
||||||
|
element_category_to_save,
|
||||||
|
extract_to_payload,
|
||||||
|
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
||||||
|
):
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
elements = [
|
elements = [
|
||||||
Image(
|
Image(
|
||||||
text="3",
|
text="3",
|
||||||
coordinates=(
|
coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
|
||||||
(78.7401411111111, 86.61545694444455),
|
|
||||||
(78.7401411111111, 519.9487805555556),
|
|
||||||
(512.0734647222223, 519.9487805555556),
|
|
||||||
(512.0734647222223, 86.61545694444455),
|
|
||||||
),
|
|
||||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
metadata=ElementMetadata(page_number=1),
|
metadata=ElementMetadata(page_number=1),
|
||||||
),
|
),
|
||||||
Image(
|
Image(
|
||||||
text="4",
|
text="4",
|
||||||
coordinates=(
|
coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
|
||||||
(570.8661397222222, 86.6154566666667),
|
|
||||||
(570.8661397222222, 519.6862825000001),
|
|
||||||
(1003.9369655555556, 519.6862825000001),
|
|
||||||
(1003.9369655555556, 86.6154566666667),
|
|
||||||
),
|
|
||||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
metadata=ElementMetadata(page_number=1),
|
metadata=ElementMetadata(page_number=1),
|
||||||
),
|
),
|
||||||
Image(
|
Image(
|
||||||
text="5",
|
text="5",
|
||||||
coordinates=(
|
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
||||||
(1062.9921808333331, 86.61545694444455),
|
|
||||||
(1062.9921808333331, 519.9487805555556),
|
|
||||||
(1496.3255044444445, 519.9487805555556),
|
|
||||||
(1496.3255044444445, 86.61545694444455),
|
|
||||||
),
|
|
||||||
coordinate_system=PixelSpace(width=1575, height=1166),
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
metadata=ElementMetadata(page_number=1),
|
metadata=ElementMetadata(page_number=1),
|
||||||
),
|
),
|
||||||
|
Table(
|
||||||
|
text="Sample Table",
|
||||||
|
coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
|
||||||
|
coordinate_system=PixelSpace(width=1575, height=1166),
|
||||||
|
metadata=ElementMetadata(page_number=2),
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
pdf_image_utils.save_elements(
|
pdf_image_utils.save_elements(
|
||||||
elements=elements,
|
elements=elements,
|
||||||
element_category_to_save=ElementType.IMAGE,
|
element_category_to_save=element_category_to_save,
|
||||||
pdf_image_dpi=200,
|
pdf_image_dpi=200,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
output_dir_path=str(tmpdir),
|
output_dir_path=str(tmpdir),
|
||||||
|
extract_to_payload=extract_to_payload,
|
||||||
)
|
)
|
||||||
|
|
||||||
for i, el in enumerate(elements):
|
saved_elements = [el for el in elements if el.category == element_category_to_save]
|
||||||
|
for i, el in enumerate(saved_elements):
|
||||||
|
basename = "table" if el.category == ElementType.TABLE else "figure"
|
||||||
expected_image_path = os.path.join(
|
expected_image_path = os.path.join(
|
||||||
str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
|
str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
|
||||||
)
|
)
|
||||||
assert os.path.isfile(el.metadata.image_path)
|
if extract_to_payload:
|
||||||
|
assert isinstance(el.metadata.image_base64, str)
|
||||||
|
assert isinstance(el.metadata.image_mime_type, str)
|
||||||
|
assert not el.metadata.image_path
|
||||||
|
assert not os.path.isfile(expected_image_path)
|
||||||
|
else:
|
||||||
|
assert os.path.isfile(expected_image_path)
|
||||||
assert el.metadata.image_path == expected_image_path
|
assert el.metadata.image_path == expected_image_path
|
||||||
|
assert not el.metadata.image_base64
|
||||||
|
assert not el.metadata.image_mime_type
|
||||||
|
|
||||||
|
|
||||||
def test_write_image_raises_error():
|
def test_write_image_raises_error():
|
||||||
|
|||||||
@ -3,7 +3,7 @@ import os
|
|||||||
import pathlib
|
import pathlib
|
||||||
import warnings
|
import warnings
|
||||||
from importlib import import_module
|
from importlib import import_module
|
||||||
from unittest.mock import ANY, Mock, patch
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
import docx
|
import docx
|
||||||
import pytest
|
import pytest
|
||||||
@ -347,15 +347,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
|
|||||||
|
|
||||||
mock_partition.assert_called_once_with(
|
mock_partition.assert_called_once_with(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
metadata_filename=None,
|
|
||||||
file=None,
|
file=None,
|
||||||
url=None,
|
url=None,
|
||||||
include_page_breaks=False,
|
|
||||||
infer_table_structure=False,
|
|
||||||
extract_images_in_pdf=ANY,
|
|
||||||
image_output_dir_path=ANY,
|
|
||||||
strategy=PartitionStrategy.FAST,
|
strategy=PartitionStrategy.FAST,
|
||||||
languages=None,
|
languages=None,
|
||||||
|
metadata_filename=None,
|
||||||
|
include_page_breaks=False,
|
||||||
|
infer_table_structure=False,
|
||||||
|
extract_images_in_pdf=False,
|
||||||
|
extract_element_types=None,
|
||||||
|
image_output_dir_path=None,
|
||||||
|
extract_to_payload=False,
|
||||||
hi_res_model_name=None,
|
hi_res_model_name=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -173,9 +173,11 @@ class ElementMetadata:
|
|||||||
file_directory: Optional[str]
|
file_directory: Optional[str]
|
||||||
filename: Optional[str]
|
filename: Optional[str]
|
||||||
filetype: Optional[str]
|
filetype: Optional[str]
|
||||||
|
image_path: Optional[str]
|
||||||
|
image_base64: Optional[str]
|
||||||
|
image_mime_type: Optional[str]
|
||||||
# -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
|
# -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
|
||||||
header_footer_type: Optional[str]
|
header_footer_type: Optional[str]
|
||||||
image_path: Optional[str]
|
|
||||||
# -- used in chunks only, when chunk must be split mid-text to fit window --
|
# -- used in chunks only, when chunk must be split mid-text to fit window --
|
||||||
is_continuation: Optional[bool]
|
is_continuation: Optional[bool]
|
||||||
languages: Optional[List[str]]
|
languages: Optional[List[str]]
|
||||||
@ -457,6 +459,8 @@ class ConsolidationStrategy(enum.Enum):
|
|||||||
"filetype": cls.FIRST,
|
"filetype": cls.FIRST,
|
||||||
"header_footer_type": cls.DROP,
|
"header_footer_type": cls.DROP,
|
||||||
"image_path": cls.DROP,
|
"image_path": cls.DROP,
|
||||||
|
"image_base64": cls.DROP,
|
||||||
|
"image_mime_type": cls.DROP,
|
||||||
"is_continuation": cls.DROP, # -- not expected, added by chunking, not before --
|
"is_continuation": cls.DROP, # -- not expected, added by chunking, not before --
|
||||||
"languages": cls.LIST_UNIQUE,
|
"languages": cls.LIST_UNIQUE,
|
||||||
"last_modified": cls.FIRST,
|
"last_modified": cls.FIRST,
|
||||||
|
|||||||
@ -137,7 +137,9 @@ def partition(
|
|||||||
detect_language_per_element: bool = False,
|
detect_language_per_element: bool = False,
|
||||||
pdf_infer_table_structure: bool = False,
|
pdf_infer_table_structure: bool = False,
|
||||||
pdf_extract_images: bool = False,
|
pdf_extract_images: bool = False,
|
||||||
|
pdf_extract_element_types: Optional[List[str]] = None,
|
||||||
pdf_image_output_dir_path: Optional[str] = None,
|
pdf_image_output_dir_path: Optional[str] = None,
|
||||||
|
pdf_extract_to_payload: bool = False,
|
||||||
xml_keep_tags: bool = False,
|
xml_keep_tags: bool = False,
|
||||||
data_source_metadata: Optional[DataSourceMetadata] = None,
|
data_source_metadata: Optional[DataSourceMetadata] = None,
|
||||||
metadata_filename: Optional[str] = None,
|
metadata_filename: Optional[str] = None,
|
||||||
@ -193,11 +195,26 @@ def partition(
|
|||||||
transformation of the data into an HTML <table>.
|
transformation of the data into an HTML <table>.
|
||||||
The "text" field for a partitioned Table Element is always present, whether True or False.
|
The "text" field for a partitioned Table Element is always present, whether True or False.
|
||||||
pdf_extract_images
|
pdf_extract_images
|
||||||
If True and strategy=hi_res, any detected images will be saved in the path specified by
|
Only applicable if `strategy=hi_res`.
|
||||||
pdf_image_output_dir_path.
|
If True, any detected images will be saved in the path specified by 'pdf_image_output_dir_path'
|
||||||
|
or stored as base64 encoded data within metadata fields.
|
||||||
|
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||||
|
'pdf_extract_element_types' for broader extraction capabilities.
|
||||||
|
pdf_extract_element_types
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||||
|
saved in the path specified by 'pdf_image_output_dir_path' or stored as base64 encoded data
|
||||||
|
within metadata fields.
|
||||||
|
pdf_extract_to_payload
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
If True, images of the element type(s) defined in 'pdf_extract_element_types' will be encoded
|
||||||
|
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||||
|
This parameter facilitates the inclusion of element data directly within the payload,
|
||||||
|
especially for web-based applications or APIs.
|
||||||
pdf_image_output_dir_path
|
pdf_image_output_dir_path
|
||||||
If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
|
Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
|
||||||
given path
|
The filesystem path for saving images of the element type(s)
|
||||||
|
specified in 'pdf_extract_element_types'.
|
||||||
xml_keep_tags
|
xml_keep_tags
|
||||||
If True, will retain the XML tags in the output. Otherwise it will simply extract
|
If True, will retain the XML tags in the output. Otherwise it will simply extract
|
||||||
the text from within the tags. Only applies to partition_xml.
|
the text from within the tags. Only applies to partition_xml.
|
||||||
@ -397,7 +414,9 @@ def partition(
|
|||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
extract_images_in_pdf=pdf_extract_images,
|
extract_images_in_pdf=pdf_extract_images,
|
||||||
|
extract_element_types=pdf_extract_element_types,
|
||||||
image_output_dir_path=pdf_image_output_dir_path,
|
image_output_dir_path=pdf_image_output_dir_path,
|
||||||
|
extract_to_payload=pdf_extract_to_payload,
|
||||||
hi_res_model_name=hi_res_model_name or model_name,
|
hi_res_model_name=hi_res_model_name or model_name,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -26,6 +26,10 @@ def partition_image(
|
|||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
chunking_strategy: Optional[str] = None,
|
chunking_strategy: Optional[str] = None,
|
||||||
hi_res_model_name: Optional[str] = None,
|
hi_res_model_name: Optional[str] = None,
|
||||||
|
extract_images_in_pdf: bool = False,
|
||||||
|
extract_element_types: Optional[List[str]] = None,
|
||||||
|
image_output_dir_path: Optional[str] = None,
|
||||||
|
extract_to_payload: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Parses an image into a list of interpreted elements.
|
"""Parses an image into a list of interpreted elements.
|
||||||
@ -58,6 +62,27 @@ def partition_image(
|
|||||||
The last modified date for the document.
|
The last modified date for the document.
|
||||||
hi_res_model_name
|
hi_res_model_name
|
||||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||||
|
extract_images_in_pdf
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
|
||||||
|
or stored as base64 encoded data within metadata fields.
|
||||||
|
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||||
|
'extract_element_types' for broader extraction capabilities.
|
||||||
|
extract_element_types
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||||
|
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
|
||||||
|
within metadata fields.
|
||||||
|
extract_to_payload
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
|
||||||
|
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||||
|
This parameter facilitates the inclusion of element data directly within the payload,
|
||||||
|
especially for web-based applications or APIs.
|
||||||
|
image_output_dir_path
|
||||||
|
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
|
||||||
|
The filesystem path for saving images of the element type(s)
|
||||||
|
specified in 'extract_element_types'.
|
||||||
"""
|
"""
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
|
|
||||||
@ -93,5 +118,9 @@ def partition_image(
|
|||||||
strategy=strategy,
|
strategy=strategy,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
hi_res_model_name=hi_res_model_name,
|
hi_res_model_name=hi_res_model_name,
|
||||||
|
extract_images_in_pdf=extract_images_in_pdf,
|
||||||
|
extract_element_types=extract_element_types,
|
||||||
|
image_output_dir_path=image_output_dir_path,
|
||||||
|
extract_to_payload=extract_to_payload,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -106,7 +106,6 @@ from unstructured.utils import requires_dependencies
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
|
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
|
||||||
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
|
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
|
||||||
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore
|
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore
|
||||||
@ -140,10 +139,11 @@ def partition_pdf(
|
|||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
chunking_strategy: Optional[str] = None, # used by decorator
|
chunking_strategy: Optional[str] = None, # used by decorator
|
||||||
links: Sequence[Link] = [],
|
links: Sequence[Link] = [],
|
||||||
|
hi_res_model_name: Optional[str] = None,
|
||||||
extract_images_in_pdf: bool = False,
|
extract_images_in_pdf: bool = False,
|
||||||
extract_element_types: Optional[List[str]] = None,
|
extract_element_types: Optional[List[str]] = None,
|
||||||
image_output_dir_path: Optional[str] = None,
|
image_output_dir_path: Optional[str] = None,
|
||||||
hi_res_model_name: Optional[str] = None,
|
extract_to_payload: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Parses a pdf document into a list of interpreted elements.
|
"""Parses a pdf document into a list of interpreted elements.
|
||||||
@ -173,18 +173,29 @@ def partition_pdf(
|
|||||||
with Tesseract, you'll first need to install the appropriate Tesseract language pack.
|
with Tesseract, you'll first need to install the appropriate Tesseract language pack.
|
||||||
metadata_last_modified
|
metadata_last_modified
|
||||||
The last modified date for the document.
|
The last modified date for the document.
|
||||||
extract_images_in_pdf
|
|
||||||
Only applicable if `strategy=hi_res`.
|
|
||||||
If `True`, any detected images will be saved in the path specified by
|
|
||||||
image_output_dir_path.
|
|
||||||
extract_element_types
|
|
||||||
Only applicable if `strategy=hi_res`.
|
|
||||||
Images of the element type(s) defined in this list will be saved to `image_output_dir_path`.
|
|
||||||
image_output_dir_path
|
|
||||||
Only applicable if `strategy=hi_res`.
|
|
||||||
The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
|
|
||||||
hi_res_model_name
|
hi_res_model_name
|
||||||
The layout detection model used when partitioning strategy is set to `hi_res`.
|
The layout detection model used when partitioning strategy is set to `hi_res`.
|
||||||
|
extract_images_in_pdf
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
If True, any detected images will be saved in the path specified by 'image_output_dir_path'
|
||||||
|
or stored as base64 encoded data within metadata fields.
|
||||||
|
Deprecation Note: This parameter is marked for deprecation. Future versions will use
|
||||||
|
'extract_element_types' for broader extraction capabilities.
|
||||||
|
extract_element_types
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
|
||||||
|
saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
|
||||||
|
within metadata fields.
|
||||||
|
extract_to_payload
|
||||||
|
Only applicable if `strategy=hi_res`.
|
||||||
|
If True, images of the element type(s) defined in 'extract_element_types' will be encoded
|
||||||
|
as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
|
||||||
|
This parameter facilitates the inclusion of element data directly within the payload,
|
||||||
|
especially for web-based applications or APIs.
|
||||||
|
image_output_dir_path
|
||||||
|
Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
|
||||||
|
The filesystem path for saving images of the element type(s)
|
||||||
|
specified in 'extract_element_types'.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
exactly_one(filename=filename, file=file)
|
exactly_one(filename=filename, file=file)
|
||||||
@ -199,10 +210,11 @@ def partition_pdf(
|
|||||||
infer_table_structure=infer_table_structure,
|
infer_table_structure=infer_table_structure,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
metadata_last_modified=metadata_last_modified,
|
metadata_last_modified=metadata_last_modified,
|
||||||
|
hi_res_model_name=hi_res_model_name,
|
||||||
extract_images_in_pdf=extract_images_in_pdf,
|
extract_images_in_pdf=extract_images_in_pdf,
|
||||||
extract_element_types=extract_element_types,
|
extract_element_types=extract_element_types,
|
||||||
image_output_dir_path=image_output_dir_path,
|
image_output_dir_path=image_output_dir_path,
|
||||||
hi_res_model_name=hi_res_model_name,
|
extract_to_payload=extract_to_payload,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -249,13 +261,14 @@ def _partition_pdf_or_image_local(
|
|||||||
languages: Optional[List[str]] = None,
|
languages: Optional[List[str]] = None,
|
||||||
ocr_mode: str = OCRMode.FULL_PAGE.value,
|
ocr_mode: str = OCRMode.FULL_PAGE.value,
|
||||||
model_name: Optional[str] = None, # to be deprecated in favor of `hi_res_model_name`
|
model_name: Optional[str] = None, # to be deprecated in favor of `hi_res_model_name`
|
||||||
|
hi_res_model_name: Optional[str] = None,
|
||||||
|
pdf_image_dpi: Optional[int] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
pdf_text_extractable: bool = False,
|
pdf_text_extractable: bool = False,
|
||||||
extract_images_in_pdf: bool = False,
|
extract_images_in_pdf: bool = False,
|
||||||
extract_element_types: Optional[List[str]] = None,
|
extract_element_types: Optional[List[str]] = None,
|
||||||
image_output_dir_path: Optional[str] = None,
|
image_output_dir_path: Optional[str] = None,
|
||||||
pdf_image_dpi: Optional[int] = None,
|
extract_to_payload: bool = False,
|
||||||
hi_res_model_name: Optional[str] = None,
|
|
||||||
analysis: bool = False,
|
analysis: bool = False,
|
||||||
analyzed_image_output_dir_path: Optional[str] = None,
|
analyzed_image_output_dir_path: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@ -402,7 +415,9 @@ def _partition_pdf_or_image_local(
|
|||||||
element_category_to_save=ElementType.IMAGE,
|
element_category_to_save=ElementType.IMAGE,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
is_image=is_image,
|
||||||
pdf_image_dpi=pdf_image_dpi,
|
pdf_image_dpi=pdf_image_dpi,
|
||||||
|
extract_to_payload=extract_to_payload,
|
||||||
output_dir_path=image_output_dir_path,
|
output_dir_path=image_output_dir_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -415,7 +430,9 @@ def _partition_pdf_or_image_local(
|
|||||||
element_category_to_save=el_type,
|
element_category_to_save=el_type,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
file=file,
|
file=file,
|
||||||
|
is_image=is_image,
|
||||||
pdf_image_dpi=pdf_image_dpi,
|
pdf_image_dpi=pdf_image_dpi,
|
||||||
|
extract_to_payload=extract_to_payload,
|
||||||
output_dir_path=image_output_dir_path,
|
output_dir_path=image_output_dir_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -425,10 +442,12 @@ def _partition_pdf_or_image_local(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if isinstance(el, Image):
|
if isinstance(el, Image):
|
||||||
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
if (
|
||||||
if not el.metadata.image_path and (
|
not extract_images_in_pdf
|
||||||
el.text is None or len(el.text) < 24 or el.text.find(" ") == -1
|
and ElementType.IMAGE not in extract_element_types
|
||||||
|
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
|
||||||
):
|
):
|
||||||
|
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
out_elements.append(cast(Element, el))
|
out_elements.append(cast(Element, el))
|
||||||
@ -457,10 +476,11 @@ def partition_pdf_or_image(
|
|||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
languages: Optional[List[str]] = None,
|
languages: Optional[List[str]] = None,
|
||||||
metadata_last_modified: Optional[str] = None,
|
metadata_last_modified: Optional[str] = None,
|
||||||
|
hi_res_model_name: Optional[str] = None,
|
||||||
extract_images_in_pdf: bool = False,
|
extract_images_in_pdf: bool = False,
|
||||||
extract_element_types: Optional[List[str]] = None,
|
extract_element_types: Optional[List[str]] = None,
|
||||||
image_output_dir_path: Optional[str] = None,
|
image_output_dir_path: Optional[str] = None,
|
||||||
hi_res_model_name: Optional[str] = None,
|
extract_to_payload: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[Element]:
|
) -> List[Element]:
|
||||||
"""Parses a pdf or image document into a list of interpreted elements."""
|
"""Parses a pdf or image document into a list of interpreted elements."""
|
||||||
@ -518,11 +538,12 @@ def partition_pdf_or_image(
|
|||||||
include_page_breaks=include_page_breaks,
|
include_page_breaks=include_page_breaks,
|
||||||
languages=languages,
|
languages=languages,
|
||||||
metadata_last_modified=metadata_last_modified or last_modification_date,
|
metadata_last_modified=metadata_last_modified or last_modification_date,
|
||||||
|
hi_res_model_name=hi_res_model_name,
|
||||||
pdf_text_extractable=pdf_text_extractable,
|
pdf_text_extractable=pdf_text_extractable,
|
||||||
extract_images_in_pdf=extract_images_in_pdf,
|
extract_images_in_pdf=extract_images_in_pdf,
|
||||||
extract_element_types=extract_element_types,
|
extract_element_types=extract_element_types,
|
||||||
image_output_dir_path=image_output_dir_path,
|
image_output_dir_path=image_output_dir_path,
|
||||||
hi_res_model_name=hi_res_model_name,
|
extract_to_payload=extract_to_payload,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
out_elements = _process_uncategorized_text_elements(elements)
|
out_elements = _process_uncategorized_text_elements(elements)
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
|
import base64
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from io import BytesIO
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
||||||
|
|
||||||
@ -79,11 +81,17 @@ def save_elements(
|
|||||||
pdf_image_dpi: int,
|
pdf_image_dpi: int,
|
||||||
filename: str = "",
|
filename: str = "",
|
||||||
file: Optional[Union[bytes, BinaryIO]] = None,
|
file: Optional[Union[bytes, BinaryIO]] = None,
|
||||||
|
is_image: bool = False,
|
||||||
|
extract_to_payload: bool = False,
|
||||||
output_dir_path: Optional[str] = None,
|
output_dir_path: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Extract and save images from the page. This method iterates through the layout elements
|
Saves specific elements from a PDF as images either to a directory or embeds them in the
|
||||||
of the page, identifies image regions, and extracts and saves them as separate image files.
|
element's payload.
|
||||||
|
|
||||||
|
This function processes a list of elements partitioned from a PDF file. For each element of
|
||||||
|
a specified category, it extracts and saves the image. The images can either be saved to
|
||||||
|
a specified directory or embedded into the element's payload as a base64-encoded string.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if not output_dir_path:
|
if not output_dir_path:
|
||||||
@ -91,6 +99,17 @@ def save_elements(
|
|||||||
os.makedirs(output_dir_path, exist_ok=True)
|
os.makedirs(output_dir_path, exist_ok=True)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
if is_image:
|
||||||
|
if file is None:
|
||||||
|
image_paths = [filename]
|
||||||
|
else:
|
||||||
|
if hasattr(file, "seek"):
|
||||||
|
file.seek(0)
|
||||||
|
temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
|
||||||
|
temp_file.write(file.read() if hasattr(file, "read") else file)
|
||||||
|
temp_file.flush()
|
||||||
|
image_paths = [temp_file.name]
|
||||||
|
else:
|
||||||
_image_paths = convert_pdf_to_image(
|
_image_paths = convert_pdf_to_image(
|
||||||
filename,
|
filename,
|
||||||
file,
|
file,
|
||||||
@ -124,6 +143,14 @@ def save_elements(
|
|||||||
image_path = image_paths[page_number - 1]
|
image_path = image_paths[page_number - 1]
|
||||||
image = Image.open(image_path)
|
image = Image.open(image_path)
|
||||||
cropped_image = image.crop((x1, y1, x2, y2))
|
cropped_image = image.crop((x1, y1, x2, y2))
|
||||||
|
if extract_to_payload:
|
||||||
|
buffered = BytesIO()
|
||||||
|
cropped_image.save(buffered, format="JPEG")
|
||||||
|
img_base64 = base64.b64encode(buffered.getvalue())
|
||||||
|
img_base64_str = img_base64.decode()
|
||||||
|
el.metadata.image_base64 = img_base64_str
|
||||||
|
el.metadata.image_mime_type = "image/jpeg"
|
||||||
|
else:
|
||||||
write_image(cropped_image, output_f_path)
|
write_image(cropped_image, output_f_path)
|
||||||
# add image path to element metadata
|
# add image path to element metadata
|
||||||
el.metadata.image_path = output_f_path
|
el.metadata.image_path = output_f_path
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user