| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  | import os | 
					
						
							|  |  |  | import tempfile | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  | from unittest.mock import MagicMock, patch | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | import numpy as np | 
					
						
							|  |  |  | import pytest | 
					
						
							|  |  |  | from PIL import Image as PILImg | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from test_unstructured.unit_utils import example_doc_path | 
					
						
							|  |  |  | from unstructured.documents.coordinates import PixelSpace | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  | from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  | from unstructured.partition.pdf_image import pdf_image_utils | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("image_type", ["pil", "numpy_array"]) | 
					
						
							|  |  |  | def test_write_image(image_type): | 
					
						
							|  |  |  |     mock_pil_image = PILImg.new("RGB", (50, 50)) | 
					
						
							|  |  |  |     mock_numpy_image = np.zeros((50, 50, 3), np.uint8) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     image_map = { | 
					
						
							|  |  |  |         "pil": mock_pil_image, | 
					
						
							|  |  |  |         "numpy_array": mock_numpy_image, | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     image = image_map[image_type] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         output_image_path = os.path.join(tmpdir, "test_image.jpg") | 
					
						
							|  |  |  |         pdf_image_utils.write_image(image, output_image_path) | 
					
						
							|  |  |  |         assert os.path.exists(output_image_path) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Additional check to see if the written image can be read | 
					
						
							|  |  |  |         read_image = PILImg.open(output_image_path) | 
					
						
							|  |  |  |         assert read_image is not None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize("file_mode", ["filename", "rb"]) | 
					
						
							|  |  |  | @pytest.mark.parametrize("path_only", [True, False]) | 
					
						
							|  |  |  | def test_convert_pdf_to_image( | 
					
						
							|  |  |  |     file_mode, path_only, filename=example_doc_path("embedded-images.pdf") | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         if file_mode == "filename": | 
					
						
							|  |  |  |             images = pdf_image_utils.convert_pdf_to_image( | 
					
						
							|  |  |  |                 filename=filename, | 
					
						
							|  |  |  |                 file=None, | 
					
						
							|  |  |  |                 output_folder=tmpdir, | 
					
						
							|  |  |  |                 path_only=path_only, | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             with open(filename, "rb") as f: | 
					
						
							|  |  |  |                 images = pdf_image_utils.convert_pdf_to_image( | 
					
						
							|  |  |  |                     filename="", | 
					
						
							|  |  |  |                     file=f, | 
					
						
							|  |  |  |                     output_folder=tmpdir, | 
					
						
							|  |  |  |                     path_only=path_only, | 
					
						
							|  |  |  |                 ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if path_only: | 
					
						
							|  |  |  |             assert isinstance(images[0], str) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             assert isinstance(images[0], PILImg.Image) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  | def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")): | 
					
						
							|  |  |  |     with pytest.raises(ValueError) as exc_info: | 
					
						
							|  |  |  |         pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert str(exc_info.value) == "output_folder must be specified if path_only is true" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("filename", "is_image"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         (example_doc_path("layout-parser-paper-fast.pdf"), False), | 
					
						
							|  |  |  |         (example_doc_path("layout-parser-paper-fast.jpg"), True), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  | @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) | 
					
						
							| 
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 |  |  | @pytest.mark.parametrize("extract_image_block_to_payload", [False, True]) | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  | def test_save_elements( | 
					
						
							|  |  |  |     element_category_to_save, | 
					
						
							| 
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 |  |  |     extract_image_block_to_payload, | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  |     filename, | 
					
						
							|  |  |  |     is_image, | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  |     with tempfile.TemporaryDirectory() as tmpdir: | 
					
						
							|  |  |  |         elements = [ | 
					
						
							|  |  |  |             Image( | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  |                 text="Image Text 1", | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  |                 coordinates=((78, 86), (78, 519), (512, 519), (512, 86)), | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  |                 coordinate_system=PixelSpace(width=1575, height=1166), | 
					
						
							|  |  |  |                 metadata=ElementMetadata(page_number=1), | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             Image( | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  |                 text="Image Text 2", | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  |                 coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)), | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  |                 coordinate_system=PixelSpace(width=1575, height=1166), | 
					
						
							|  |  |  |                 metadata=ElementMetadata(page_number=1), | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             Image( | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  |                 text="Table 1", | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  |                 coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)), | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  |                 coordinate_system=PixelSpace(width=1575, height=1166), | 
					
						
							|  |  |  |                 metadata=ElementMetadata(page_number=1), | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |         ] | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  |         if not is_image: | 
					
						
							|  |  |  |             # add a page 2 element | 
					
						
							|  |  |  |             elements.append( | 
					
						
							|  |  |  |                 Table( | 
					
						
							|  |  |  |                     text="Table 2", | 
					
						
							|  |  |  |                     coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)), | 
					
						
							|  |  |  |                     coordinate_system=PixelSpace(width=1575, height=1166), | 
					
						
							|  |  |  |                     metadata=ElementMetadata(page_number=2), | 
					
						
							|  |  |  |                 ), | 
					
						
							|  |  |  |             ) | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-11 11:14:41 -08:00
										 |  |  |         pdf_image_utils.save_elements( | 
					
						
							|  |  |  |             elements=elements, | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  |             element_category_to_save=element_category_to_save, | 
					
						
							| 
									
										
										
										
											2023-12-11 11:14:41 -08:00
										 |  |  |             pdf_image_dpi=200, | 
					
						
							|  |  |  |             filename=filename, | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  |             is_image=is_image, | 
					
						
							| 
									
										
										
										
											2023-12-11 11:14:41 -08:00
										 |  |  |             output_dir_path=str(tmpdir), | 
					
						
							| 
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 |  |  |             extract_image_block_to_payload=extract_image_block_to_payload, | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  |         saved_elements = [el for el in elements if el.category == element_category_to_save] | 
					
						
							|  |  |  |         for i, el in enumerate(saved_elements): | 
					
						
							|  |  |  |             basename = "table" if el.category == ElementType.TABLE else "figure" | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  |             expected_image_path = os.path.join( | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  |                 str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg" | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 |  |  |             if extract_image_block_to_payload: | 
					
						
							| 
									
										
										
											
												Feat: return base64 encoded images for PDF's (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64 encoded string from a PIL image
- store base64 encoded image data in two metadata fields: `image_base64`
and `image_mime_type`
- update the "image element filter" logic to keep all image elements in
the output if a user specifies image extraction
### Testing
```
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition
elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
											
										 
											2023-12-26 21:39:01 -08:00
										 |  |  |                 assert isinstance(el.metadata.image_base64, str) | 
					
						
							|  |  |  |                 assert isinstance(el.metadata.image_mime_type, str) | 
					
						
							|  |  |  |                 assert not el.metadata.image_path | 
					
						
							|  |  |  |                 assert not os.path.isfile(expected_image_path) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 assert os.path.isfile(expected_image_path) | 
					
						
							|  |  |  |                 assert el.metadata.image_path == expected_image_path | 
					
						
							|  |  |  |                 assert not el.metadata.image_base64 | 
					
						
							|  |  |  |                 assert not el.metadata.image_mime_type | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  | def test_save_elements_with_output_dir_path_none(): | 
					
						
							|  |  |  |     with ( | 
					
						
							|  |  |  |         patch("PIL.Image.open"), | 
					
						
							|  |  |  |         patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"), | 
					
						
							|  |  |  |         patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"), | 
					
						
							|  |  |  |         tempfile.TemporaryDirectory() as tmpdir, | 
					
						
							|  |  |  |     ): | 
					
						
							|  |  |  |         original_cwd = os.getcwd() | 
					
						
							|  |  |  |         os.chdir(tmpdir) | 
					
						
							|  |  |  |         pdf_image_utils.save_elements( | 
					
						
							|  |  |  |             elements=[], | 
					
						
							|  |  |  |             element_category_to_save="", | 
					
						
							|  |  |  |             pdf_image_dpi=200, | 
					
						
							|  |  |  |             filename="dummy.pdf", | 
					
						
							|  |  |  |             output_dir_path=None, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # Verify that the images are saved in the expected directory | 
					
						
							|  |  |  |         expected_output_dir = os.path.join(tmpdir, "figures") | 
					
						
							|  |  |  |         assert os.path.exists(expected_output_dir) | 
					
						
							|  |  |  |         assert os.path.isdir(expected_output_dir) | 
					
						
							|  |  |  |         os.chdir(original_cwd) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-05 10:22:29 -08:00
										 |  |  | def test_write_image_raises_error(): | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         pdf_image_utils.write_image("invalid_type", "test_image.jpg") | 
					
						
							| 
									
										
										
										
											2023-12-14 00:49:23 -06:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("text", "outcome"), [("", False), ("foo", True), (None, False), ("(cid:10)boo", False)] | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_valid_text(text, outcome): | 
					
						
							|  |  |  |     assert pdf_image_utils.valid_text(text) == outcome | 
					
						
							| 
									
										
										
										
											2024-01-18 22:28:32 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_pad_bbox(): | 
					
						
							|  |  |  |     bbox = (100, 100, 200, 200) | 
					
						
							|  |  |  |     padding = (10, 20)  # Horizontal padding 10, Vertical padding 20 | 
					
						
							|  |  |  |     expected = (90, 80, 210, 220) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     result = pdf_image_utils.pad_bbox(bbox, padding) | 
					
						
							|  |  |  |     assert result == expected | 
					
						
							| 
									
										
										
										
											2024-02-13 21:19:07 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("input_types", "expected"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         (None, []), | 
					
						
							|  |  |  |         (["table", "image"], ["Table", "Image"]), | 
					
						
							|  |  |  |         (["unknown"], ["Unknown"]), | 
					
						
							|  |  |  |         (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_check_element_types_to_extract(input_types, expected): | 
					
						
							|  |  |  |     assert pdf_image_utils.check_element_types_to_extract(input_types) == expected | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_check_element_types_to_extract_raises_error(): | 
					
						
							|  |  |  |     with pytest.raises(TypeError) as exc_info: | 
					
						
							|  |  |  |         pdf_image_utils.check_element_types_to_extract("not a list") | 
					
						
							|  |  |  |     assert "must be a list" in str(exc_info.value) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class MockPageLayout: | 
					
						
							|  |  |  |     def annotate(self, colors): | 
					
						
							|  |  |  |         return "mock_image" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class MockDocumentLayout: | 
					
						
							|  |  |  |     pages = [MockPageLayout(), MockPageLayout] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_annotate_layout_elements_with_image(): | 
					
						
							|  |  |  |     inferred_layout = MockPageLayout() | 
					
						
							|  |  |  |     extracted_layout = MockPageLayout() | 
					
						
							|  |  |  |     output_basename = "test_page" | 
					
						
							|  |  |  |     page_number = 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Check if images for both layouts were saved | 
					
						
							|  |  |  |     with ( | 
					
						
							|  |  |  |         tempfile.TemporaryDirectory() as tmpdir, | 
					
						
							|  |  |  |         patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image, | 
					
						
							|  |  |  |     ): | 
					
						
							|  |  |  |         pdf_image_utils.annotate_layout_elements_with_image( | 
					
						
							|  |  |  |             inferred_page_layout=inferred_layout, | 
					
						
							|  |  |  |             extracted_page_layout=extracted_layout, | 
					
						
							|  |  |  |             output_dir_path=str(tmpdir), | 
					
						
							|  |  |  |             output_f_basename=output_basename, | 
					
						
							|  |  |  |             page_number=page_number, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         expected_filenames = [ | 
					
						
							|  |  |  |             f"{output_basename}_{page_number}_inferred.jpg", | 
					
						
							|  |  |  |             f"{output_basename}_{page_number}_extracted.jpg", | 
					
						
							|  |  |  |         ] | 
					
						
							|  |  |  |         actual_calls = [call.args[1] for call in mock_write_image.call_args_list] | 
					
						
							|  |  |  |         for expected_filename in expected_filenames: | 
					
						
							|  |  |  |             assert any(expected_filename in actual_call for actual_call in actual_calls) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Check if only the inferred layout image was saved if extracted layout is None | 
					
						
							|  |  |  |     with ( | 
					
						
							|  |  |  |         tempfile.TemporaryDirectory() as tmpdir, | 
					
						
							|  |  |  |         patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image, | 
					
						
							|  |  |  |     ): | 
					
						
							|  |  |  |         pdf_image_utils.annotate_layout_elements_with_image( | 
					
						
							|  |  |  |             inferred_page_layout=inferred_layout, | 
					
						
							|  |  |  |             extracted_page_layout=None, | 
					
						
							|  |  |  |             output_dir_path=str(tmpdir), | 
					
						
							|  |  |  |             output_f_basename=output_basename, | 
					
						
							|  |  |  |             page_number=page_number, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         expected_filename = f"{output_basename}_{page_number}_inferred.jpg" | 
					
						
							|  |  |  |         actual_calls = [call.args[1] for call in mock_write_image.call_args_list] | 
					
						
							|  |  |  |         assert any(expected_filename in actual_call for actual_call in actual_calls) | 
					
						
							|  |  |  |         assert len(actual_calls) == 1  # Only one image should be saved | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("filename", "is_image"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         (example_doc_path("layout-parser-paper-fast.pdf"), False), | 
					
						
							|  |  |  |         (example_doc_path("layout-parser-paper-fast.jpg"), True), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_annotate_layout_elements(filename, is_image): | 
					
						
							|  |  |  |     inferred_document_layout = MockDocumentLayout | 
					
						
							|  |  |  |     extracted_layout = [MagicMock(), MagicMock()] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     with ( | 
					
						
							|  |  |  |         patch("PIL.Image.open"), | 
					
						
							|  |  |  |         patch( | 
					
						
							|  |  |  |             "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image", | 
					
						
							|  |  |  |             return_value=["/path/to/image1.jpg", "/path/to/image2.jpg"], | 
					
						
							|  |  |  |         ) as mock_pdf2image, | 
					
						
							|  |  |  |         patch( | 
					
						
							|  |  |  |             "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image" | 
					
						
							|  |  |  |         ) as mock_annotate_layout_elements_with_image, | 
					
						
							|  |  |  |     ): | 
					
						
							|  |  |  |         pdf_image_utils.annotate_layout_elements( | 
					
						
							|  |  |  |             inferred_document_layout=inferred_document_layout, | 
					
						
							|  |  |  |             extracted_layout=extracted_layout, | 
					
						
							|  |  |  |             filename=filename, | 
					
						
							|  |  |  |             output_dir_path="/output", | 
					
						
							|  |  |  |             pdf_image_dpi=200, | 
					
						
							|  |  |  |             is_image=is_image, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         if is_image: | 
					
						
							|  |  |  |             mock_annotate_layout_elements_with_image.assert_called_once() | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             assert mock_annotate_layout_elements_with_image.call_count == len( | 
					
						
							|  |  |  |                 mock_pdf2image.return_value | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_annotate_layout_elements_file_not_found_error(): | 
					
						
							|  |  |  |     with pytest.raises(FileNotFoundError): | 
					
						
							|  |  |  |         pdf_image_utils.annotate_layout_elements( | 
					
						
							|  |  |  |             inferred_document_layout=MagicMock(), | 
					
						
							|  |  |  |             extracted_layout=[], | 
					
						
							|  |  |  |             filename="nonexistent.jpg", | 
					
						
							|  |  |  |             output_dir_path="/output", | 
					
						
							|  |  |  |             pdf_image_dpi=200, | 
					
						
							|  |  |  |             is_image=True, | 
					
						
							|  |  |  |         ) |