Austin Walker e3417d7e98
fix: Fix for Pillow error when extracting PNG images (#3998)
When I tried to partition a PNG file and extract images, I got an error
from Pillow:

```
WARNING  unstructured:pdf_image_utils.py:230 Image Extraction Error: Skipping the failed image
Traceback (most recent call last):
  File "/Users/austin/.pyenv/versions/unstructured/lib/python3.10/site-packages/PIL/JpegImagePlugin.py", line 666, in _save
    rawmode = RAWMODE[im.mode]
KeyError: 'RGBA'
```

The issue is that a PNG can carry an alpha channel (mode RGBA), which cannot
be saved in JPEG format. We can fix this with a quick mode conversion before
saving. I added a PNG test case that now passes with this fix.
2025-05-08 21:57:05 +00:00

358 lines
12 KiB
Python

import os
import tempfile
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from PIL import Image as PILImg
from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
from unstructured.partition.pdf_image import pdf_image_utils
@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
def test_write_image(image_type):
    """Check that write_image stores a PIL image or a numpy array as a readable file.

    Args:
        image_type: Key selecting which in-memory image representation is written.
    """
    image_map = {
        "pil": PILImg.new("RGB", (50, 50)),
        "numpy_array": np.zeros((50, 50, 3), np.uint8),
    }
    image = image_map[image_type]

    with tempfile.TemporaryDirectory() as tmpdir:
        output_image_path = os.path.join(tmpdir, "test_image.jpg")
        pdf_image_utils.write_image(image, output_image_path)

        assert os.path.exists(output_image_path)

        # Additional check to see if the written image can be read. Image.open() is lazy,
        # so load() is what actually decodes the pixel data; the context manager releases
        # the file handle before the temporary directory is removed.
        with PILImg.open(output_image_path) as read_image:
            read_image.load()
            assert read_image is not None
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(
    file_mode, path_only, filename=example_doc_path("pdf/embedded-images.pdf")
):
    """Convert a PDF given by path or by open file and check the element type returned.

    With ``path_only`` the first result must be a string, otherwise a PIL image.
    """
    expected_type = str if path_only else PILImg.Image

    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode != "filename":
            # File-object mode: the PDF is supplied as a binary handle and no filename.
            with open(filename, "rb") as f:
                images = pdf_image_utils.convert_pdf_to_image(
                    filename="",
                    file=f,
                    output_folder=tmpdir,
                    path_only=path_only,
                )
        else:
            images = pdf_image_utils.convert_pdf_to_image(
                filename=filename,
                file=None,
                output_folder=tmpdir,
                path_only=path_only,
            )

        first_result = images[0]
        assert isinstance(first_result, expected_type)
def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-images.pdf")):
    """Requesting path_only output without an output_folder must raise ValueError."""
    expected_message = "output_folder must be specified if path_only is true"

    with pytest.raises(ValueError) as exc_info:
        pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)

    # Checked after the raises-block: statements following the raising call inside it never run.
    raised_message = str(exc_info.value)
    assert raised_message == expected_message
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
        (example_doc_path("img/english-and-korean.png"), True),
    ],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_save_elements(
    element_category_to_save,
    extract_image_block_to_payload,
    filename,
    is_image,
):
    """Save elements of one category and verify where each extracted image ends up.

    When ``extract_image_block_to_payload`` is set, the image is expected in the element
    metadata (base64 + mime type) and not on disk; otherwise it is expected on disk under
    the output directory with the path recorded in the metadata.
    """

    def build_element(element_cls, text, x_left, x_right, page_number):
        # All fixtures share the same vertical extent (86..519) in a 1575x1166 pixel space.
        return element_cls(
            text=text,
            coordinates=((x_left, 86), (x_left, 519), (x_right, 519), (x_right, 86)),
            coordinate_system=PixelSpace(width=1575, height=1166),
            metadata=ElementMetadata(page_number=page_number),
        )

    with tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): the third element is an Image although its text reads "Table 1" --
        # confirm whether a Table was intended.
        elements = [
            build_element(Image, "Image Text 1", 78, 512, 1),
            build_element(Image, "Image Text 2", 570, 1003, 1),
            build_element(Image, "Table 1", 1062, 1496, 1),
        ]
        if not is_image:
            # add a page 2 element
            elements.append(build_element(Table, "Table 2", 1062, 1496, 2))

        pdf_image_utils.save_elements(
            elements=elements,
            starting_page_number=1,
            element_category_to_save=element_category_to_save,
            pdf_image_dpi=200,
            filename=filename,
            is_image=is_image,
            output_dir_path=str(tmpdir),
            extract_image_block_to_payload=extract_image_block_to_payload,
        )

        matching_elements = [
            element for element in elements if element.category == element_category_to_save
        ]
        for position, element in enumerate(matching_elements, start=1):
            prefix = "table" if element.category == ElementType.TABLE else "figure"
            image_name = f"{prefix}-{element.metadata.page_number}-{position}.jpg"
            expected_image_path = os.path.join(str(tmpdir), image_name)

            if not extract_image_block_to_payload:
                # File mode: image written to disk, nothing embedded in the metadata.
                assert os.path.isfile(expected_image_path)
                assert element.metadata.image_path == expected_image_path
                assert not element.metadata.image_base64
                assert not element.metadata.image_mime_type
                continue

            # Payload mode: image embedded in the metadata, nothing written to disk.
            assert isinstance(element.metadata.image_base64, str)
            assert isinstance(element.metadata.image_mime_type, str)
            assert not element.metadata.image_path
            assert not os.path.isfile(expected_image_path)
@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
    """Check which "figures" directory exists after save_elements runs without output_dir_path.

    With the global working directory enabled, the directory is expected under
    ``env_config.GLOBAL_WORKING_PROCESS_DIR``; otherwise under the current working
    directory, which is pointed at a temporary directory for the duration of the test.

    Args:
        monkeypatch: pytest fixture used to set the environment variable.
        storage_enabled: Value written to ``GLOBAL_WORKING_DIR_ENABLED``.
    """
    # Environment values are strings; pass one explicitly instead of relying on pytest's
    # implicit (and warned-about) conversion of a bool.
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", str(storage_enabled))
    with (
        patch("PIL.Image.open"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"),
        tempfile.TemporaryDirectory() as tmpdir,
    ):
        original_cwd = os.getcwd()
        os.chdir(tmpdir)
        try:
            pdf_image_utils.save_elements(
                elements=[],
                element_category_to_save="",
                starting_page_number=1,
                pdf_image_dpi=200,
                filename="dummy.pdf",
                output_dir_path=None,
            )

            # Verify that the images are saved in the expected directory
            if storage_enabled:
                from unstructured.partition.utils.config import env_config

                expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
            else:
                expected_output_dir = os.path.join(tmpdir, "figures")
            assert os.path.exists(expected_output_dir)
            assert os.path.isdir(expected_output_dir)
        finally:
            # Always leave the temporary directory, even on failure, so later tests do not
            # run with a working directory that is about to be deleted.
            os.chdir(original_cwd)
def test_write_image_raises_error():
    """Passing a plain string as the image to write_image must raise ValueError."""
    unsupported_image = "invalid_type"
    output_image_path = "test_image.jpg"

    with pytest.raises(ValueError):
        pdf_image_utils.write_image(unsupported_image, output_image_path)
@pytest.mark.parametrize(
    ("text", "outcome"),
    [
        ("", False),
        ("foo", True),
        (None, False),
        ("(cid:10)boo", False),
    ],
)
def test_valid_text(text, outcome):
    """valid_text is false for empty, missing, or cid-bearing text and true for plain text."""
    is_valid = pdf_image_utils.valid_text(text)
    assert is_valid == outcome
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("base", 0.0),
        ("", 0.0),
        ("(cid:2)", 1.0),
        ("(cid:1)a", 0.5),
        ("c(cid:1)ab", 0.25),
    ],
)
def test_cid_ratio(text, expected):
    """cid_ratio returns the expected ratio for each sample text."""
    ratio = pdf_image_utils.cid_ratio(text)
    assert ratio == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("base", False),
        ("(cid:2)", True),
        ("(cid:1234567890)", True),
        ("jkl;(cid:12)asdf", True),
    ],
)
def test_is_cid_present(text, expected):
    """is_cid_present detects a "(cid:N)" marker anywhere in the text."""
    detected = pdf_image_utils.is_cid_present(text)
    assert detected == expected
def test_pad_bbox():
    """pad_bbox grows the box by the horizontal and vertical padding on every side."""
    horizontal_padding, vertical_padding = 10, 20

    padded = pdf_image_utils.pad_bbox(
        (100, 100, 200, 200),
        (horizontal_padding, vertical_padding),
    )

    assert padded == (90, 80, 210, 220)
@pytest.mark.parametrize(
    ("input_types", "expected"),
    [
        (None, []),
        (["table", "image"], ["Table", "Image"]),
        (["unknown"], ["Unknown"]),
        (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]),
        (["NarrativeText", "narrativetext"], ["NarrativeText", "NarrativeText"]),
    ],
)
def test_check_element_types_to_extract(input_types, expected):
    """Element type names are normalised regardless of input casing; None yields []."""
    normalised_types = pdf_image_utils.check_element_types_to_extract(input_types)
    assert normalised_types == expected
def test_check_element_types_to_extract_raises_error():
    """A non-list argument is rejected with a TypeError mentioning "must be a list"."""
    not_a_list = "not a list"

    with pytest.raises(TypeError) as exc_info:
        pdf_image_utils.check_element_types_to_extract(not_a_list)

    # Inspect the message once the raises-block has captured the exception.
    error_message = str(exc_info.value)
    assert "must be a list" in error_message
class MockPageLayout:
    """Minimal page-layout stand-in exposing only an ``annotate`` method."""

    def annotate(self, colors):
        """Return a fixed placeholder string; ``colors`` is accepted but not used."""
        placeholder_image = "mock_image"
        return placeholder_image
class MockDocumentLayout:
    """Minimal document-layout stand-in exposing a two-page ``pages`` list."""

    # Both entries are instances; a bare class here would break any caller that invokes
    # page methods such as annotate() without patching them out.
    pages = [MockPageLayout(), MockPageLayout()]
def test_annotate_layout_elements_with_image():
    """One annotated image is written per supplied page layout (inferred and extracted)."""
    inferred_layout = MockPageLayout()
    extracted_layout = MockPageLayout()
    output_basename = "test_page"
    page_number = 1

    def collect_written_paths(extracted_page_layout):
        # Run the annotation with write_image mocked out and return every path it was
        # asked to write (second positional argument of each call).
        with (
            tempfile.TemporaryDirectory() as tmpdir,
            patch(
                "unstructured.partition.pdf_image.pdf_image_utils.write_image"
            ) as mock_write_image,
        ):
            pdf_image_utils.annotate_layout_elements_with_image(
                inferred_page_layout=inferred_layout,
                extracted_page_layout=extracted_page_layout,
                output_dir_path=str(tmpdir),
                output_f_basename=output_basename,
                page_number=page_number,
            )
            return [call.args[1] for call in mock_write_image.call_args_list]

    # Check if images for both layouts were saved
    written_paths = collect_written_paths(extracted_layout)
    expected_filenames = [
        f"{output_basename}_{page_number}_inferred.jpg",
        f"{output_basename}_{page_number}_extracted.jpg",
    ]
    for expected_filename in expected_filenames:
        assert any(expected_filename in written_path for written_path in written_paths)

    # Check if only the inferred layout image was saved if extracted layout is None
    written_paths = collect_written_paths(None)
    expected_filename = f"{output_basename}_{page_number}_inferred.jpg"
    assert any(expected_filename in written_path for written_path in written_paths)
    assert len(written_paths) == 1  # Only one image should be saved
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
    ],
)
def test_annotate_layout_elements(filename, is_image):
    """Per-page annotation runs once for an image input and once per page image for a PDF."""
    page_image_paths = ["/path/to/image1.jpg", "/path/to/image2.jpg"]

    with (
        patch("PIL.Image.open"),
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image",
            return_value=page_image_paths,
        ) as mock_pdf2image,
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image"
        ) as mock_annotate_page,
    ):
        pdf_image_utils.annotate_layout_elements(
            inferred_document_layout=MockDocumentLayout,
            extracted_layout=[MagicMock(), MagicMock()],
            filename=filename,
            output_dir_path="/output",
            pdf_image_dpi=200,
            is_image=is_image,
        )

        # A single image is one page; a PDF yields one call per converted page image.
        expected_call_count = 1 if is_image else len(mock_pdf2image.return_value)
        assert mock_annotate_page.call_count == expected_call_count
def test_annotate_layout_elements_file_not_found_error():
    """Annotating a non-existent image file must raise FileNotFoundError."""
    call_kwargs = {
        "inferred_document_layout": MagicMock(),
        "extracted_layout": [],
        "filename": "nonexistent.jpg",
        "output_dir_path": "/output",
        "pdf_image_dpi": 200,
        "is_image": True,
    }

    with pytest.raises(FileNotFoundError):
        pdf_image_utils.annotate_layout_elements(**call_kwargs)
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("test\tco\x0cn\ftrol\ncharacter\rs\b", "test control characters"),
        ("\"'\\", "\"'\\"),
    ],
)
def test_remove_control_characters(text, expected):
    """Control characters are cleaned out while quotes and backslashes are left untouched."""
    cleaned_text = pdf_image_utils.remove_control_characters(text)
    assert cleaned_text == expected