mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	Feat: return base64 encoded images for PDFs (#2310)
Closes #2302.

### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64` and `image_mime_type`
- update the "image element filter" logic to keep all image elements in the output if a user specifies image extraction

### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
This commit is contained in:
		
							parent
							
								
									8ba9fadf8a
								
							
						
					
					
						commit
						dd144456de
					
				@ -6,6 +6,8 @@
 | 
				
			|||||||
* **Update encoders to leverage dataclasses** All encoders now follow a class approach which get annotated with the dataclass decorator. Similar to the connectors, it uses a nested dataclass for the configs required to configure a client as well as a field/property approach to cache the client. This makes sure any variable associated with the class exists as a dataclass field.
 | 
					* **Update encoders to leverage dataclasses** All encoders now follow a class approach which get annotated with the dataclass decorator. Similar to the connectors, it uses a nested dataclass for the configs required to configure a client as well as a field/property approach to cache the client. This makes sure any variable associated with the class exists as a dataclass field.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Features
 | 
					### Features
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					* **Store base64 encoded image data in metadata fields.** When the user requests it (e.g. via a parameter such as `pdf_extract_to_payload`), extracted images are not saved to file; instead, the base64 encoded image bytes and the image's MIME type are stored in the metadata fields `image_base64` and `image_mime_type`. This allows the API to have parity with the library.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Fixes
 | 
					### Fixes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										
											BIN
										
									
								
								example-docs/embedded-images-tables.jpg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								example-docs/embedded-images-tables.jpg
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| 
		 After Width: | Height: | Size: 251 KiB  | 
							
								
								
									
										944
									
								
								example-docs/embedded-images-tables.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										944
									
								
								example-docs/embedded-images-tables.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							@ -1,5 +1,6 @@
 | 
				
			|||||||
import os
 | 
					import os
 | 
				
			||||||
import pathlib
 | 
					import pathlib
 | 
				
			||||||
 | 
					import tempfile
 | 
				
			||||||
from unittest import mock
 | 
					from unittest import mock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
@ -7,6 +8,7 @@ from PIL import Image
 | 
				
			|||||||
from pytesseract import TesseractError
 | 
					from pytesseract import TesseractError
 | 
				
			||||||
from unstructured_inference.inference import layout
 | 
					from unstructured_inference.inference import layout
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
 | 
				
			||||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 | 
					from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
 | 
				
			||||||
from unstructured.chunking.title import chunk_by_title
 | 
					from unstructured.chunking.title import chunk_by_title
 | 
				
			||||||
from unstructured.documents.elements import ElementType
 | 
					from unstructured.documents.elements import ElementType
 | 
				
			||||||
@ -632,3 +634,34 @@ def test_partition_image_has_filename(inference_results):
 | 
				
			|||||||
    assert element.metadata.filetype == "JPEG"
 | 
					    assert element.metadata.filetype == "JPEG"
 | 
				
			||||||
    # This should be kept from the filename we originally gave
 | 
					    # This should be kept from the filename we originally gave
 | 
				
			||||||
    assert element.metadata.filename == filename
 | 
					    assert element.metadata.filename == filename
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("file_mode", ["filename", "rb"])
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("extract_to_payload", [False, True])
 | 
				
			||||||
 | 
					def test_partition_image_element_extraction(
 | 
				
			||||||
 | 
					    file_mode,
 | 
				
			||||||
 | 
					    extract_to_payload,
 | 
				
			||||||
 | 
					    filename=example_doc_path("embedded-images-tables.jpg"),
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    extract_element_types = ["Image", "Table"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    with tempfile.TemporaryDirectory() as tmpdir:
 | 
				
			||||||
 | 
					        if file_mode == "filename":
 | 
				
			||||||
 | 
					            elements = image.partition_image(
 | 
				
			||||||
 | 
					                filename=filename,
 | 
				
			||||||
 | 
					                strategy="hi_res",
 | 
				
			||||||
 | 
					                extract_element_types=extract_element_types,
 | 
				
			||||||
 | 
					                extract_to_payload=extract_to_payload,
 | 
				
			||||||
 | 
					                image_output_dir_path=tmpdir,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            with open(filename, "rb") as f:
 | 
				
			||||||
 | 
					                elements = image.partition_image(
 | 
				
			||||||
 | 
					                    file=f,
 | 
				
			||||||
 | 
					                    strategy="hi_res",
 | 
				
			||||||
 | 
					                    extract_element_types=extract_element_types,
 | 
				
			||||||
 | 
					                    extract_to_payload=extract_to_payload,
 | 
				
			||||||
 | 
					                    image_output_dir_path=tmpdir,
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,6 +1,8 @@
 | 
				
			|||||||
 | 
					import base64
 | 
				
			||||||
import logging
 | 
					import logging
 | 
				
			||||||
import math
 | 
					import math
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					import tempfile
 | 
				
			||||||
from tempfile import SpooledTemporaryFile
 | 
					from tempfile import SpooledTemporaryFile
 | 
				
			||||||
from unittest import mock
 | 
					from unittest import mock
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -15,6 +17,7 @@ from unstructured.documents.coordinates import PixelSpace
 | 
				
			|||||||
from unstructured.documents.elements import (
 | 
					from unstructured.documents.elements import (
 | 
				
			||||||
    CoordinatesMetadata,
 | 
					    CoordinatesMetadata,
 | 
				
			||||||
    ElementMetadata,
 | 
					    ElementMetadata,
 | 
				
			||||||
 | 
					    ElementType,
 | 
				
			||||||
    ListItem,
 | 
					    ListItem,
 | 
				
			||||||
    NarrativeText,
 | 
					    NarrativeText,
 | 
				
			||||||
    Text,
 | 
					    Text,
 | 
				
			||||||
@ -1123,3 +1126,62 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
 | 
				
			|||||||
    caplog.set_level(logging.INFO)
 | 
					    caplog.set_level(logging.INFO)
 | 
				
			||||||
    assert pdf.extractable_elements(filename=example_doc_path(filename))
 | 
					    assert pdf.extractable_elements(filename=example_doc_path(filename))
 | 
				
			||||||
    assert expected_log in caplog.text
 | 
					    assert expected_log in caplog.text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir):
 | 
				
			||||||
 | 
					    extracted_elements = []
 | 
				
			||||||
 | 
					    for el_type in extract_element_types:
 | 
				
			||||||
 | 
					        extracted_elements_by_type = []
 | 
				
			||||||
 | 
					        for el in elements:
 | 
				
			||||||
 | 
					            if el.category == el_type:
 | 
				
			||||||
 | 
					                extracted_elements_by_type.append(el)
 | 
				
			||||||
 | 
					        extracted_elements.append(extracted_elements_by_type)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for extracted_elements_by_type in extracted_elements:
 | 
				
			||||||
 | 
					        for i, el in enumerate(extracted_elements_by_type):
 | 
				
			||||||
 | 
					            if extract_to_payload:
 | 
				
			||||||
 | 
					                assert el.metadata.image_base64 is not None
 | 
				
			||||||
 | 
					                assert el.metadata.image_mime_type == "image/jpeg"
 | 
				
			||||||
 | 
					                image_data = base64.b64decode(el.metadata.image_base64)
 | 
				
			||||||
 | 
					                assert isinstance(image_data, bytes)
 | 
				
			||||||
 | 
					                assert el.metadata.image_path is None
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                basename = "table" if el.category == ElementType.TABLE else "figure"
 | 
				
			||||||
 | 
					                expected_image_path = os.path.join(
 | 
				
			||||||
 | 
					                    str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					                assert el.metadata.image_path == expected_image_path
 | 
				
			||||||
 | 
					                assert os.path.isfile(expected_image_path)
 | 
				
			||||||
 | 
					                assert el.metadata.image_base64 is None
 | 
				
			||||||
 | 
					                assert el.metadata.image_mime_type is None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("file_mode", ["filename", "rb"])
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("extract_to_payload", [False, True])
 | 
				
			||||||
 | 
					def test_partition_pdf_element_extraction(
 | 
				
			||||||
 | 
					    file_mode,
 | 
				
			||||||
 | 
					    extract_to_payload,
 | 
				
			||||||
 | 
					    filename=example_doc_path("embedded-images-tables.pdf"),
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    extract_element_types = ["Image", "Table"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    with tempfile.TemporaryDirectory() as tmpdir:
 | 
				
			||||||
 | 
					        if file_mode == "filename":
 | 
				
			||||||
 | 
					            elements = pdf.partition_pdf(
 | 
				
			||||||
 | 
					                filename=filename,
 | 
				
			||||||
 | 
					                strategy="hi_res",
 | 
				
			||||||
 | 
					                extract_element_types=extract_element_types,
 | 
				
			||||||
 | 
					                extract_to_payload=extract_to_payload,
 | 
				
			||||||
 | 
					                image_output_dir_path=tmpdir,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            with open(filename, "rb") as f:
 | 
				
			||||||
 | 
					                elements = pdf.partition_pdf(
 | 
				
			||||||
 | 
					                    file=f,
 | 
				
			||||||
 | 
					                    strategy="hi_res",
 | 
				
			||||||
 | 
					                    extract_element_types=extract_element_types,
 | 
				
			||||||
 | 
					                    extract_to_payload=extract_to_payload,
 | 
				
			||||||
 | 
					                    image_output_dir_path=tmpdir,
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        assert_element_extraction(elements, extract_element_types, extract_to_payload, tmpdir)
 | 
				
			||||||
 | 
				
			|||||||
@ -7,7 +7,7 @@ from PIL import Image as PILImg
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from test_unstructured.unit_utils import example_doc_path
 | 
					from test_unstructured.unit_utils import example_doc_path
 | 
				
			||||||
from unstructured.documents.coordinates import PixelSpace
 | 
					from unstructured.documents.coordinates import PixelSpace
 | 
				
			||||||
from unstructured.documents.elements import ElementMetadata, ElementType, Image
 | 
					from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
 | 
				
			||||||
from unstructured.partition.pdf_image import pdf_image_utils
 | 
					from unstructured.partition.pdf_image import pdf_image_utils
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -60,58 +60,66 @@ def test_convert_pdf_to_image(
 | 
				
			|||||||
            assert isinstance(images[0], PILImg.Image)
 | 
					            assert isinstance(images[0], PILImg.Image)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_save_elements(filename=example_doc_path("embedded-images.pdf")):
 | 
					@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("extract_to_payload", [False, True])
 | 
				
			||||||
 | 
					def test_save_elements(
 | 
				
			||||||
 | 
					    element_category_to_save,
 | 
				
			||||||
 | 
					    extract_to_payload,
 | 
				
			||||||
 | 
					    filename=example_doc_path("layout-parser-paper-fast.pdf"),
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
    with tempfile.TemporaryDirectory() as tmpdir:
 | 
					    with tempfile.TemporaryDirectory() as tmpdir:
 | 
				
			||||||
        elements = [
 | 
					        elements = [
 | 
				
			||||||
            Image(
 | 
					            Image(
 | 
				
			||||||
                text="3",
 | 
					                text="3",
 | 
				
			||||||
                coordinates=(
 | 
					                coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
 | 
				
			||||||
                    (78.7401411111111, 86.61545694444455),
 | 
					 | 
				
			||||||
                    (78.7401411111111, 519.9487805555556),
 | 
					 | 
				
			||||||
                    (512.0734647222223, 519.9487805555556),
 | 
					 | 
				
			||||||
                    (512.0734647222223, 86.61545694444455),
 | 
					 | 
				
			||||||
                ),
 | 
					 | 
				
			||||||
                coordinate_system=PixelSpace(width=1575, height=1166),
 | 
					                coordinate_system=PixelSpace(width=1575, height=1166),
 | 
				
			||||||
                metadata=ElementMetadata(page_number=1),
 | 
					                metadata=ElementMetadata(page_number=1),
 | 
				
			||||||
            ),
 | 
					            ),
 | 
				
			||||||
            Image(
 | 
					            Image(
 | 
				
			||||||
                text="4",
 | 
					                text="4",
 | 
				
			||||||
                coordinates=(
 | 
					                coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
 | 
				
			||||||
                    (570.8661397222222, 86.6154566666667),
 | 
					 | 
				
			||||||
                    (570.8661397222222, 519.6862825000001),
 | 
					 | 
				
			||||||
                    (1003.9369655555556, 519.6862825000001),
 | 
					 | 
				
			||||||
                    (1003.9369655555556, 86.6154566666667),
 | 
					 | 
				
			||||||
                ),
 | 
					 | 
				
			||||||
                coordinate_system=PixelSpace(width=1575, height=1166),
 | 
					                coordinate_system=PixelSpace(width=1575, height=1166),
 | 
				
			||||||
                metadata=ElementMetadata(page_number=1),
 | 
					                metadata=ElementMetadata(page_number=1),
 | 
				
			||||||
            ),
 | 
					            ),
 | 
				
			||||||
            Image(
 | 
					            Image(
 | 
				
			||||||
                text="5",
 | 
					                text="5",
 | 
				
			||||||
                coordinates=(
 | 
					                coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
 | 
				
			||||||
                    (1062.9921808333331, 86.61545694444455),
 | 
					 | 
				
			||||||
                    (1062.9921808333331, 519.9487805555556),
 | 
					 | 
				
			||||||
                    (1496.3255044444445, 519.9487805555556),
 | 
					 | 
				
			||||||
                    (1496.3255044444445, 86.61545694444455),
 | 
					 | 
				
			||||||
                ),
 | 
					 | 
				
			||||||
                coordinate_system=PixelSpace(width=1575, height=1166),
 | 
					                coordinate_system=PixelSpace(width=1575, height=1166),
 | 
				
			||||||
                metadata=ElementMetadata(page_number=1),
 | 
					                metadata=ElementMetadata(page_number=1),
 | 
				
			||||||
            ),
 | 
					            ),
 | 
				
			||||||
 | 
					            Table(
 | 
				
			||||||
 | 
					                text="Sample Table",
 | 
				
			||||||
 | 
					                coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
 | 
				
			||||||
 | 
					                coordinate_system=PixelSpace(width=1575, height=1166),
 | 
				
			||||||
 | 
					                metadata=ElementMetadata(page_number=2),
 | 
				
			||||||
 | 
					            ),
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        pdf_image_utils.save_elements(
 | 
					        pdf_image_utils.save_elements(
 | 
				
			||||||
            elements=elements,
 | 
					            elements=elements,
 | 
				
			||||||
            element_category_to_save=ElementType.IMAGE,
 | 
					            element_category_to_save=element_category_to_save,
 | 
				
			||||||
            pdf_image_dpi=200,
 | 
					            pdf_image_dpi=200,
 | 
				
			||||||
            filename=filename,
 | 
					            filename=filename,
 | 
				
			||||||
            output_dir_path=str(tmpdir),
 | 
					            output_dir_path=str(tmpdir),
 | 
				
			||||||
 | 
					            extract_to_payload=extract_to_payload,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for i, el in enumerate(elements):
 | 
					        saved_elements = [el for el in elements if el.category == element_category_to_save]
 | 
				
			||||||
 | 
					        for i, el in enumerate(saved_elements):
 | 
				
			||||||
 | 
					            basename = "table" if el.category == ElementType.TABLE else "figure"
 | 
				
			||||||
            expected_image_path = os.path.join(
 | 
					            expected_image_path = os.path.join(
 | 
				
			||||||
                str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
 | 
					                str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            assert os.path.isfile(el.metadata.image_path)
 | 
					            if extract_to_payload:
 | 
				
			||||||
            assert el.metadata.image_path == expected_image_path
 | 
					                assert isinstance(el.metadata.image_base64, str)
 | 
				
			||||||
 | 
					                assert isinstance(el.metadata.image_mime_type, str)
 | 
				
			||||||
 | 
					                assert not el.metadata.image_path
 | 
				
			||||||
 | 
					                assert not os.path.isfile(expected_image_path)
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                assert os.path.isfile(expected_image_path)
 | 
				
			||||||
 | 
					                assert el.metadata.image_path == expected_image_path
 | 
				
			||||||
 | 
					                assert not el.metadata.image_base64
 | 
				
			||||||
 | 
					                assert not el.metadata.image_mime_type
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_write_image_raises_error():
 | 
					def test_write_image_raises_error():
 | 
				
			||||||
 | 
				
			|||||||
@ -3,7 +3,7 @@ import os
 | 
				
			|||||||
import pathlib
 | 
					import pathlib
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
from importlib import import_module
 | 
					from importlib import import_module
 | 
				
			||||||
from unittest.mock import ANY, Mock, patch
 | 
					from unittest.mock import Mock, patch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import docx
 | 
					import docx
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
@ -347,15 +347,17 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    mock_partition.assert_called_once_with(
 | 
					    mock_partition.assert_called_once_with(
 | 
				
			||||||
        filename=filename,
 | 
					        filename=filename,
 | 
				
			||||||
        metadata_filename=None,
 | 
					 | 
				
			||||||
        file=None,
 | 
					        file=None,
 | 
				
			||||||
        url=None,
 | 
					        url=None,
 | 
				
			||||||
        include_page_breaks=False,
 | 
					 | 
				
			||||||
        infer_table_structure=False,
 | 
					 | 
				
			||||||
        extract_images_in_pdf=ANY,
 | 
					 | 
				
			||||||
        image_output_dir_path=ANY,
 | 
					 | 
				
			||||||
        strategy=PartitionStrategy.FAST,
 | 
					        strategy=PartitionStrategy.FAST,
 | 
				
			||||||
        languages=None,
 | 
					        languages=None,
 | 
				
			||||||
 | 
					        metadata_filename=None,
 | 
				
			||||||
 | 
					        include_page_breaks=False,
 | 
				
			||||||
 | 
					        infer_table_structure=False,
 | 
				
			||||||
 | 
					        extract_images_in_pdf=False,
 | 
				
			||||||
 | 
					        extract_element_types=None,
 | 
				
			||||||
 | 
					        image_output_dir_path=None,
 | 
				
			||||||
 | 
					        extract_to_payload=False,
 | 
				
			||||||
        hi_res_model_name=None,
 | 
					        hi_res_model_name=None,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -173,9 +173,11 @@ class ElementMetadata:
 | 
				
			|||||||
    file_directory: Optional[str]
 | 
					    file_directory: Optional[str]
 | 
				
			||||||
    filename: Optional[str]
 | 
					    filename: Optional[str]
 | 
				
			||||||
    filetype: Optional[str]
 | 
					    filetype: Optional[str]
 | 
				
			||||||
 | 
					    image_path: Optional[str]
 | 
				
			||||||
 | 
					    image_base64: Optional[str]
 | 
				
			||||||
 | 
					    image_mime_type: Optional[str]
 | 
				
			||||||
    # -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
 | 
					    # -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
 | 
				
			||||||
    header_footer_type: Optional[str]
 | 
					    header_footer_type: Optional[str]
 | 
				
			||||||
    image_path: Optional[str]
 | 
					 | 
				
			||||||
    # -- used in chunks only, when chunk must be split mid-text to fit window --
 | 
					    # -- used in chunks only, when chunk must be split mid-text to fit window --
 | 
				
			||||||
    is_continuation: Optional[bool]
 | 
					    is_continuation: Optional[bool]
 | 
				
			||||||
    languages: Optional[List[str]]
 | 
					    languages: Optional[List[str]]
 | 
				
			||||||
@ -457,6 +459,8 @@ class ConsolidationStrategy(enum.Enum):
 | 
				
			|||||||
            "filetype": cls.FIRST,
 | 
					            "filetype": cls.FIRST,
 | 
				
			||||||
            "header_footer_type": cls.DROP,
 | 
					            "header_footer_type": cls.DROP,
 | 
				
			||||||
            "image_path": cls.DROP,
 | 
					            "image_path": cls.DROP,
 | 
				
			||||||
 | 
					            "image_base64": cls.DROP,
 | 
				
			||||||
 | 
					            "image_mime_type": cls.DROP,
 | 
				
			||||||
            "is_continuation": cls.DROP,  # -- not expected, added by chunking, not before --
 | 
					            "is_continuation": cls.DROP,  # -- not expected, added by chunking, not before --
 | 
				
			||||||
            "languages": cls.LIST_UNIQUE,
 | 
					            "languages": cls.LIST_UNIQUE,
 | 
				
			||||||
            "last_modified": cls.FIRST,
 | 
					            "last_modified": cls.FIRST,
 | 
				
			||||||
 | 
				
			|||||||
@ -137,7 +137,9 @@ def partition(
 | 
				
			|||||||
    detect_language_per_element: bool = False,
 | 
					    detect_language_per_element: bool = False,
 | 
				
			||||||
    pdf_infer_table_structure: bool = False,
 | 
					    pdf_infer_table_structure: bool = False,
 | 
				
			||||||
    pdf_extract_images: bool = False,
 | 
					    pdf_extract_images: bool = False,
 | 
				
			||||||
 | 
					    pdf_extract_element_types: Optional[List[str]] = None,
 | 
				
			||||||
    pdf_image_output_dir_path: Optional[str] = None,
 | 
					    pdf_image_output_dir_path: Optional[str] = None,
 | 
				
			||||||
 | 
					    pdf_extract_to_payload: bool = False,
 | 
				
			||||||
    xml_keep_tags: bool = False,
 | 
					    xml_keep_tags: bool = False,
 | 
				
			||||||
    data_source_metadata: Optional[DataSourceMetadata] = None,
 | 
					    data_source_metadata: Optional[DataSourceMetadata] = None,
 | 
				
			||||||
    metadata_filename: Optional[str] = None,
 | 
					    metadata_filename: Optional[str] = None,
 | 
				
			||||||
@ -193,11 +195,26 @@ def partition(
 | 
				
			|||||||
        transformation of the data into an HTML <table>.
 | 
					        transformation of the data into an HTML <table>.
 | 
				
			||||||
        The "text" field for a partitioned Table Element is always present, whether True or False.
 | 
					        The "text" field for a partitioned Table Element is always present, whether True or False.
 | 
				
			||||||
    pdf_extract_images
 | 
					    pdf_extract_images
 | 
				
			||||||
        If True and strategy=hi_res, any detected images will be saved in the path specified by
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
        pdf_image_output_dir_path.
 | 
					        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
 | 
				
			||||||
 | 
					        or stored as base64 encoded data within metadata fields.
 | 
				
			||||||
 | 
					        Deprecation Note: This parameter is marked for deprecation. Future versions will use
 | 
				
			||||||
 | 
					        'extract_element_types' for broader extraction capabilities.
 | 
				
			||||||
 | 
					    pdf_extract_element_types
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
 | 
				
			||||||
 | 
					        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
 | 
				
			||||||
 | 
					        within metadata fields.
 | 
				
			||||||
 | 
					    pdf_extract_to_payload
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
 | 
				
			||||||
 | 
					        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
 | 
				
			||||||
 | 
					        This parameter facilitates the inclusion of element data directly within the payload,
 | 
				
			||||||
 | 
					        especially for web-based applications or APIs.
 | 
				
			||||||
    pdf_image_output_dir_path
 | 
					    pdf_image_output_dir_path
 | 
				
			||||||
        If pdf_extract_images=True and strategy=hi_res, any detected images will be saved in the
 | 
					        Only applicable if `strategy=hi_res` and `pdf_extract_to_payload=False`.
 | 
				
			||||||
        given path
 | 
					        The filesystem path for saving images of the element type(s)
 | 
				
			||||||
 | 
					        specified in 'extract_element_types'.
 | 
				
			||||||
    xml_keep_tags
 | 
					    xml_keep_tags
 | 
				
			||||||
        If True, will retain the XML tags in the output. Otherwise it will simply extract
 | 
					        If True, will retain the XML tags in the output. Otherwise it will simply extract
 | 
				
			||||||
        the text from within the tags. Only applies to partition_xml.
 | 
					        the text from within the tags. Only applies to partition_xml.
 | 
				
			||||||
@ -397,7 +414,9 @@ def partition(
 | 
				
			|||||||
            strategy=strategy,
 | 
					            strategy=strategy,
 | 
				
			||||||
            languages=languages,
 | 
					            languages=languages,
 | 
				
			||||||
            extract_images_in_pdf=pdf_extract_images,
 | 
					            extract_images_in_pdf=pdf_extract_images,
 | 
				
			||||||
 | 
					            extract_element_types=pdf_extract_element_types,
 | 
				
			||||||
            image_output_dir_path=pdf_image_output_dir_path,
 | 
					            image_output_dir_path=pdf_image_output_dir_path,
 | 
				
			||||||
 | 
					            extract_to_payload=pdf_extract_to_payload,
 | 
				
			||||||
            hi_res_model_name=hi_res_model_name or model_name,
 | 
					            hi_res_model_name=hi_res_model_name or model_name,
 | 
				
			||||||
            **kwargs,
 | 
					            **kwargs,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
				
			|||||||
@ -26,6 +26,10 @@ def partition_image(
 | 
				
			|||||||
    metadata_last_modified: Optional[str] = None,
 | 
					    metadata_last_modified: Optional[str] = None,
 | 
				
			||||||
    chunking_strategy: Optional[str] = None,
 | 
					    chunking_strategy: Optional[str] = None,
 | 
				
			||||||
    hi_res_model_name: Optional[str] = None,
 | 
					    hi_res_model_name: Optional[str] = None,
 | 
				
			||||||
 | 
					    extract_images_in_pdf: bool = False,
 | 
				
			||||||
 | 
					    extract_element_types: Optional[List[str]] = None,
 | 
				
			||||||
 | 
					    image_output_dir_path: Optional[str] = None,
 | 
				
			||||||
 | 
					    extract_to_payload: bool = False,
 | 
				
			||||||
    **kwargs,
 | 
					    **kwargs,
 | 
				
			||||||
) -> List[Element]:
 | 
					) -> List[Element]:
 | 
				
			||||||
    """Parses an image into a list of interpreted elements.
 | 
					    """Parses an image into a list of interpreted elements.
 | 
				
			||||||
@ -58,6 +62,27 @@ def partition_image(
 | 
				
			|||||||
        The last modified date for the document.
 | 
					        The last modified date for the document.
 | 
				
			||||||
    hi_res_model_name
 | 
					    hi_res_model_name
 | 
				
			||||||
        The layout detection model used when partitioning strategy is set to `hi_res`.
 | 
					        The layout detection model used when partitioning strategy is set to `hi_res`.
 | 
				
			||||||
 | 
					    extract_images_in_pdf
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
 | 
				
			||||||
 | 
					        or stored as base64 encoded data within metadata fields.
 | 
				
			||||||
 | 
					        Deprecation Note: This parameter is marked for deprecation. Future versions will use
 | 
				
			||||||
 | 
					        'extract_element_types' for broader extraction capabilities.
 | 
				
			||||||
 | 
					    extract_element_types
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
 | 
				
			||||||
 | 
					        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
 | 
				
			||||||
 | 
					        within metadata fields.
 | 
				
			||||||
 | 
					    extract_to_payload
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
 | 
				
			||||||
 | 
					        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
 | 
				
			||||||
 | 
					        This parameter facilitates the inclusion of element data directly within the payload,
 | 
				
			||||||
 | 
					        especially for web-based applications or APIs.
 | 
				
			||||||
 | 
					    image_output_dir_path
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
 | 
				
			||||||
 | 
					        The filesystem path for saving images of the element type(s)
 | 
				
			||||||
 | 
					        specified in 'extract_element_types'.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    exactly_one(filename=filename, file=file)
 | 
					    exactly_one(filename=filename, file=file)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -93,5 +118,9 @@ def partition_image(
 | 
				
			|||||||
        strategy=strategy,
 | 
					        strategy=strategy,
 | 
				
			||||||
        metadata_last_modified=metadata_last_modified,
 | 
					        metadata_last_modified=metadata_last_modified,
 | 
				
			||||||
        hi_res_model_name=hi_res_model_name,
 | 
					        hi_res_model_name=hi_res_model_name,
 | 
				
			||||||
 | 
					        extract_images_in_pdf=extract_images_in_pdf,
 | 
				
			||||||
 | 
					        extract_element_types=extract_element_types,
 | 
				
			||||||
 | 
					        image_output_dir_path=image_output_dir_path,
 | 
				
			||||||
 | 
					        extract_to_payload=extract_to_payload,
 | 
				
			||||||
        **kwargs,
 | 
					        **kwargs,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
				
			|||||||
@ -106,7 +106,6 @@ from unstructured.utils import requires_dependencies
 | 
				
			|||||||
if TYPE_CHECKING:
 | 
					if TYPE_CHECKING:
 | 
				
			||||||
    pass
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
 | 
					# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
 | 
				
			||||||
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
 | 
					# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
 | 
				
			||||||
psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
 | 
					psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
 | 
				
			||||||
@ -140,10 +139,11 @@ def partition_pdf(
 | 
				
			|||||||
    metadata_last_modified: Optional[str] = None,
 | 
					    metadata_last_modified: Optional[str] = None,
 | 
				
			||||||
    chunking_strategy: Optional[str] = None,  # used by decorator
 | 
					    chunking_strategy: Optional[str] = None,  # used by decorator
 | 
				
			||||||
    links: Sequence[Link] = [],
 | 
					    links: Sequence[Link] = [],
 | 
				
			||||||
 | 
					    hi_res_model_name: Optional[str] = None,
 | 
				
			||||||
    extract_images_in_pdf: bool = False,
 | 
					    extract_images_in_pdf: bool = False,
 | 
				
			||||||
    extract_element_types: Optional[List[str]] = None,
 | 
					    extract_element_types: Optional[List[str]] = None,
 | 
				
			||||||
    image_output_dir_path: Optional[str] = None,
 | 
					    image_output_dir_path: Optional[str] = None,
 | 
				
			||||||
    hi_res_model_name: Optional[str] = None,
 | 
					    extract_to_payload: bool = False,
 | 
				
			||||||
    **kwargs,
 | 
					    **kwargs,
 | 
				
			||||||
) -> List[Element]:
 | 
					) -> List[Element]:
 | 
				
			||||||
    """Parses a pdf document into a list of interpreted elements.
 | 
					    """Parses a pdf document into a list of interpreted elements.
 | 
				
			||||||
@ -173,18 +173,29 @@ def partition_pdf(
 | 
				
			|||||||
        with Tesseract, you'll first need to install the appropriate Tesseract language pack.
 | 
					        with Tesseract, you'll first need to install the appropriate Tesseract language pack.
 | 
				
			||||||
    metadata_last_modified
 | 
					    metadata_last_modified
 | 
				
			||||||
        The last modified date for the document.
 | 
					        The last modified date for the document.
 | 
				
			||||||
    extract_images_in_pdf
 | 
					 | 
				
			||||||
        Only applicable if `strategy=hi_res`.
 | 
					 | 
				
			||||||
        If `True`, any detected images will be saved in the path specified by
 | 
					 | 
				
			||||||
        image_output_dir_path.
 | 
					 | 
				
			||||||
    extract_element_types
 | 
					 | 
				
			||||||
        Only applicable if `strategy=hi_res`.
 | 
					 | 
				
			||||||
        Images of the element type(s) defined in this list will be saved to `image_output_dir_path`.
 | 
					 | 
				
			||||||
    image_output_dir_path
 | 
					 | 
				
			||||||
        Only applicable if `strategy=hi_res`.
 | 
					 | 
				
			||||||
        The path for saving images when using `extract_images_in_pdf` or `extract_element_types`.
 | 
					 | 
				
			||||||
    hi_res_model_name
 | 
					    hi_res_model_name
 | 
				
			||||||
        The layout detection model used when partitioning strategy is set to `hi_res`.
 | 
					        The layout detection model used when partitioning strategy is set to `hi_res`.
 | 
				
			||||||
 | 
					    extract_images_in_pdf
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        If True, any detected images will be saved in the path specified by 'image_output_dir_path'
 | 
				
			||||||
 | 
					        or stored as base64 encoded data within metadata fields.
 | 
				
			||||||
 | 
					        Deprecation Note: This parameter is marked for deprecation. Future versions will use
 | 
				
			||||||
 | 
					        'extract_element_types' for broader extraction capabilities.
 | 
				
			||||||
 | 
					    extract_element_types
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
 | 
				
			||||||
 | 
					        saved in the path specified by 'image_output_dir_path' or stored as base64 encoded data
 | 
				
			||||||
 | 
					        within metadata fields.
 | 
				
			||||||
 | 
					    extract_to_payload
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res`.
 | 
				
			||||||
 | 
					        If True, images of the element type(s) defined in 'extract_element_types' will be encoded
 | 
				
			||||||
 | 
					        as base64 data and stored in two metadata fields: 'image_base64' and 'image_mime_type'.
 | 
				
			||||||
 | 
					        This parameter facilitates the inclusion of element data directly within the payload,
 | 
				
			||||||
 | 
					        especially for web-based applications or APIs.
 | 
				
			||||||
 | 
					    image_output_dir_path
 | 
				
			||||||
 | 
					        Only applicable if `strategy=hi_res` and `extract_to_payload=False`.
 | 
				
			||||||
 | 
					        The filesystem path for saving images of the element type(s)
 | 
				
			||||||
 | 
					        specified in 'extract_element_types'.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    exactly_one(filename=filename, file=file)
 | 
					    exactly_one(filename=filename, file=file)
 | 
				
			||||||
@ -199,10 +210,11 @@ def partition_pdf(
 | 
				
			|||||||
        infer_table_structure=infer_table_structure,
 | 
					        infer_table_structure=infer_table_structure,
 | 
				
			||||||
        languages=languages,
 | 
					        languages=languages,
 | 
				
			||||||
        metadata_last_modified=metadata_last_modified,
 | 
					        metadata_last_modified=metadata_last_modified,
 | 
				
			||||||
 | 
					        hi_res_model_name=hi_res_model_name,
 | 
				
			||||||
        extract_images_in_pdf=extract_images_in_pdf,
 | 
					        extract_images_in_pdf=extract_images_in_pdf,
 | 
				
			||||||
        extract_element_types=extract_element_types,
 | 
					        extract_element_types=extract_element_types,
 | 
				
			||||||
        image_output_dir_path=image_output_dir_path,
 | 
					        image_output_dir_path=image_output_dir_path,
 | 
				
			||||||
        hi_res_model_name=hi_res_model_name,
 | 
					        extract_to_payload=extract_to_payload,
 | 
				
			||||||
        **kwargs,
 | 
					        **kwargs,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -249,13 +261,14 @@ def _partition_pdf_or_image_local(
 | 
				
			|||||||
    languages: Optional[List[str]] = None,
 | 
					    languages: Optional[List[str]] = None,
 | 
				
			||||||
    ocr_mode: str = OCRMode.FULL_PAGE.value,
 | 
					    ocr_mode: str = OCRMode.FULL_PAGE.value,
 | 
				
			||||||
    model_name: Optional[str] = None,  # to be deprecated in favor of `hi_res_model_name`
 | 
					    model_name: Optional[str] = None,  # to be deprecated in favor of `hi_res_model_name`
 | 
				
			||||||
 | 
					    hi_res_model_name: Optional[str] = None,
 | 
				
			||||||
 | 
					    pdf_image_dpi: Optional[int] = None,
 | 
				
			||||||
    metadata_last_modified: Optional[str] = None,
 | 
					    metadata_last_modified: Optional[str] = None,
 | 
				
			||||||
    pdf_text_extractable: bool = False,
 | 
					    pdf_text_extractable: bool = False,
 | 
				
			||||||
    extract_images_in_pdf: bool = False,
 | 
					    extract_images_in_pdf: bool = False,
 | 
				
			||||||
    extract_element_types: Optional[List[str]] = None,
 | 
					    extract_element_types: Optional[List[str]] = None,
 | 
				
			||||||
    image_output_dir_path: Optional[str] = None,
 | 
					    image_output_dir_path: Optional[str] = None,
 | 
				
			||||||
    pdf_image_dpi: Optional[int] = None,
 | 
					    extract_to_payload: bool = False,
 | 
				
			||||||
    hi_res_model_name: Optional[str] = None,
 | 
					 | 
				
			||||||
    analysis: bool = False,
 | 
					    analysis: bool = False,
 | 
				
			||||||
    analyzed_image_output_dir_path: Optional[str] = None,
 | 
					    analyzed_image_output_dir_path: Optional[str] = None,
 | 
				
			||||||
    **kwargs,
 | 
					    **kwargs,
 | 
				
			||||||
@ -402,7 +415,9 @@ def _partition_pdf_or_image_local(
 | 
				
			|||||||
            element_category_to_save=ElementType.IMAGE,
 | 
					            element_category_to_save=ElementType.IMAGE,
 | 
				
			||||||
            filename=filename,
 | 
					            filename=filename,
 | 
				
			||||||
            file=file,
 | 
					            file=file,
 | 
				
			||||||
 | 
					            is_image=is_image,
 | 
				
			||||||
            pdf_image_dpi=pdf_image_dpi,
 | 
					            pdf_image_dpi=pdf_image_dpi,
 | 
				
			||||||
 | 
					            extract_to_payload=extract_to_payload,
 | 
				
			||||||
            output_dir_path=image_output_dir_path,
 | 
					            output_dir_path=image_output_dir_path,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -415,7 +430,9 @@ def _partition_pdf_or_image_local(
 | 
				
			|||||||
            element_category_to_save=el_type,
 | 
					            element_category_to_save=el_type,
 | 
				
			||||||
            filename=filename,
 | 
					            filename=filename,
 | 
				
			||||||
            file=file,
 | 
					            file=file,
 | 
				
			||||||
 | 
					            is_image=is_image,
 | 
				
			||||||
            pdf_image_dpi=pdf_image_dpi,
 | 
					            pdf_image_dpi=pdf_image_dpi,
 | 
				
			||||||
 | 
					            extract_to_payload=extract_to_payload,
 | 
				
			||||||
            output_dir_path=image_output_dir_path,
 | 
					            output_dir_path=image_output_dir_path,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -425,10 +442,12 @@ def _partition_pdf_or_image_local(
 | 
				
			|||||||
            continue
 | 
					            continue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if isinstance(el, Image):
 | 
					        if isinstance(el, Image):
 | 
				
			||||||
            # NOTE(crag): small chunks of text from Image elements tend to be garbage
 | 
					            if (
 | 
				
			||||||
            if not el.metadata.image_path and (
 | 
					                not extract_images_in_pdf
 | 
				
			||||||
                el.text is None or len(el.text) < 24 or el.text.find(" ") == -1
 | 
					                and ElementType.IMAGE not in extract_element_types
 | 
				
			||||||
 | 
					                and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
 | 
				
			||||||
            ):
 | 
					            ):
 | 
				
			||||||
 | 
					                # NOTE(crag): small chunks of text from Image elements tend to be garbage
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                out_elements.append(cast(Element, el))
 | 
					                out_elements.append(cast(Element, el))
 | 
				
			||||||
@ -457,10 +476,11 @@ def partition_pdf_or_image(
 | 
				
			|||||||
    ocr_languages: Optional[str] = None,
 | 
					    ocr_languages: Optional[str] = None,
 | 
				
			||||||
    languages: Optional[List[str]] = None,
 | 
					    languages: Optional[List[str]] = None,
 | 
				
			||||||
    metadata_last_modified: Optional[str] = None,
 | 
					    metadata_last_modified: Optional[str] = None,
 | 
				
			||||||
 | 
					    hi_res_model_name: Optional[str] = None,
 | 
				
			||||||
    extract_images_in_pdf: bool = False,
 | 
					    extract_images_in_pdf: bool = False,
 | 
				
			||||||
    extract_element_types: Optional[List[str]] = None,
 | 
					    extract_element_types: Optional[List[str]] = None,
 | 
				
			||||||
    image_output_dir_path: Optional[str] = None,
 | 
					    image_output_dir_path: Optional[str] = None,
 | 
				
			||||||
    hi_res_model_name: Optional[str] = None,
 | 
					    extract_to_payload: bool = False,
 | 
				
			||||||
    **kwargs,
 | 
					    **kwargs,
 | 
				
			||||||
) -> List[Element]:
 | 
					) -> List[Element]:
 | 
				
			||||||
    """Parses a pdf or image document into a list of interpreted elements."""
 | 
					    """Parses a pdf or image document into a list of interpreted elements."""
 | 
				
			||||||
@ -518,11 +538,12 @@ def partition_pdf_or_image(
 | 
				
			|||||||
                include_page_breaks=include_page_breaks,
 | 
					                include_page_breaks=include_page_breaks,
 | 
				
			||||||
                languages=languages,
 | 
					                languages=languages,
 | 
				
			||||||
                metadata_last_modified=metadata_last_modified or last_modification_date,
 | 
					                metadata_last_modified=metadata_last_modified or last_modification_date,
 | 
				
			||||||
 | 
					                hi_res_model_name=hi_res_model_name,
 | 
				
			||||||
                pdf_text_extractable=pdf_text_extractable,
 | 
					                pdf_text_extractable=pdf_text_extractable,
 | 
				
			||||||
                extract_images_in_pdf=extract_images_in_pdf,
 | 
					                extract_images_in_pdf=extract_images_in_pdf,
 | 
				
			||||||
                extract_element_types=extract_element_types,
 | 
					                extract_element_types=extract_element_types,
 | 
				
			||||||
                image_output_dir_path=image_output_dir_path,
 | 
					                image_output_dir_path=image_output_dir_path,
 | 
				
			||||||
                hi_res_model_name=hi_res_model_name,
 | 
					                extract_to_payload=extract_to_payload,
 | 
				
			||||||
                **kwargs,
 | 
					                **kwargs,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
            out_elements = _process_uncategorized_text_elements(elements)
 | 
					            out_elements = _process_uncategorized_text_elements(elements)
 | 
				
			||||||
 | 
				
			|||||||
@ -1,5 +1,7 @@
 | 
				
			|||||||
 | 
					import base64
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import tempfile
 | 
					import tempfile
 | 
				
			||||||
 | 
					from io import BytesIO
 | 
				
			||||||
from pathlib import PurePath
 | 
					from pathlib import PurePath
 | 
				
			||||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
 | 
					from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -79,11 +81,17 @@ def save_elements(
 | 
				
			|||||||
    pdf_image_dpi: int,
 | 
					    pdf_image_dpi: int,
 | 
				
			||||||
    filename: str = "",
 | 
					    filename: str = "",
 | 
				
			||||||
    file: Optional[Union[bytes, BinaryIO]] = None,
 | 
					    file: Optional[Union[bytes, BinaryIO]] = None,
 | 
				
			||||||
 | 
					    is_image: bool = False,
 | 
				
			||||||
 | 
					    extract_to_payload: bool = False,
 | 
				
			||||||
    output_dir_path: Optional[str] = None,
 | 
					    output_dir_path: Optional[str] = None,
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Extract and save images from the page. This method iterates through the layout elements
 | 
					    Saves specific elements from a PDF as images either to a directory or embeds them in the
 | 
				
			||||||
    of the page, identifies image regions, and extracts and saves them as separate image files.
 | 
					    element's payload.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This function processes a list of elements partitioned from a PDF file. For each element of
 | 
				
			||||||
 | 
					    a specified category, it extracts and saves the image. The images can either be saved to
 | 
				
			||||||
 | 
					    a specified directory or embedded into the element's payload as a base64-encoded string.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if not output_dir_path:
 | 
					    if not output_dir_path:
 | 
				
			||||||
@ -91,14 +99,25 @@ def save_elements(
 | 
				
			|||||||
    os.makedirs(output_dir_path, exist_ok=True)
 | 
					    os.makedirs(output_dir_path, exist_ok=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with tempfile.TemporaryDirectory() as temp_dir:
 | 
					    with tempfile.TemporaryDirectory() as temp_dir:
 | 
				
			||||||
        _image_paths = convert_pdf_to_image(
 | 
					        if is_image:
 | 
				
			||||||
            filename,
 | 
					            if file is None:
 | 
				
			||||||
            file,
 | 
					                image_paths = [filename]
 | 
				
			||||||
            pdf_image_dpi,
 | 
					            else:
 | 
				
			||||||
            output_folder=temp_dir,
 | 
					                if hasattr(file, "seek"):
 | 
				
			||||||
            path_only=True,
 | 
					                    file.seek(0)
 | 
				
			||||||
        )
 | 
					                temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
 | 
				
			||||||
        image_paths = cast(List[str], _image_paths)
 | 
					                temp_file.write(file.read() if hasattr(file, "read") else file)
 | 
				
			||||||
 | 
					                temp_file.flush()
 | 
				
			||||||
 | 
					                image_paths = [temp_file.name]
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            _image_paths = convert_pdf_to_image(
 | 
				
			||||||
 | 
					                filename,
 | 
				
			||||||
 | 
					                file,
 | 
				
			||||||
 | 
					                pdf_image_dpi,
 | 
				
			||||||
 | 
					                output_folder=temp_dir,
 | 
				
			||||||
 | 
					                path_only=True,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            image_paths = cast(List[str], _image_paths)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        figure_number = 0
 | 
					        figure_number = 0
 | 
				
			||||||
        for el in elements:
 | 
					        for el in elements:
 | 
				
			||||||
@ -124,9 +143,17 @@ def save_elements(
 | 
				
			|||||||
                image_path = image_paths[page_number - 1]
 | 
					                image_path = image_paths[page_number - 1]
 | 
				
			||||||
                image = Image.open(image_path)
 | 
					                image = Image.open(image_path)
 | 
				
			||||||
                cropped_image = image.crop((x1, y1, x2, y2))
 | 
					                cropped_image = image.crop((x1, y1, x2, y2))
 | 
				
			||||||
                write_image(cropped_image, output_f_path)
 | 
					                if extract_to_payload:
 | 
				
			||||||
                # add image path to element metadata
 | 
					                    buffered = BytesIO()
 | 
				
			||||||
                el.metadata.image_path = output_f_path
 | 
					                    cropped_image.save(buffered, format="JPEG")
 | 
				
			||||||
 | 
					                    img_base64 = base64.b64encode(buffered.getvalue())
 | 
				
			||||||
 | 
					                    img_base64_str = img_base64.decode()
 | 
				
			||||||
 | 
					                    el.metadata.image_base64 = img_base64_str
 | 
				
			||||||
 | 
					                    el.metadata.image_mime_type = "image/jpeg"
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    write_image(cropped_image, output_f_path)
 | 
				
			||||||
 | 
					                    # add image path to element metadata
 | 
				
			||||||
 | 
					                    el.metadata.image_path = output_f_path
 | 
				
			||||||
            except (ValueError, IOError):
 | 
					            except (ValueError, IOError):
 | 
				
			||||||
                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
 | 
					                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user