mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

When I tried to partition a PNG file and extract images, I got an error from Pillow: ``` WARNING unstructured:pdf_image_utils.py:230 Image Extraction Error: Skipping the failed image Traceback (most recent call last): File "/Users/austin/.pyenv/versions/unstructured/lib/python3.10/site-packages/PIL/JpegImagePlugin.py", line 666, in _save rawmode = RAWMODE[im.mode] KeyError: 'RGBA' ``` The issue is that a PNG can carry an additional alpha channel (image mode `RGBA`) that cannot be saved in JPEG format. We can fix this with a quick conversion. I added a PNG test case that now passes with this fix.
358 lines
12 KiB
Python
358 lines
12 KiB
Python
import os
|
|
import tempfile
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import numpy as np
|
|
import pytest
|
|
from PIL import Image as PILImg
|
|
|
|
from test_unstructured.unit_utils import example_doc_path
|
|
from unstructured.documents.coordinates import PixelSpace
|
|
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
|
|
from unstructured.partition.pdf_image import pdf_image_utils
|
|
|
|
|
|
@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
def test_write_image(image_type):
    """`write_image` produces a file on disk for both PIL and numpy inputs.

    The image is written to a temporary directory and then re-opened with Pillow
    to confirm that the output can be read back.
    """
    mock_pil_image = PILImg.new("RGB", (50, 50))
    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)

    image_map = {
        "pil": mock_pil_image,
        "numpy_array": mock_numpy_image,
    }
    image = image_map[image_type]

    with tempfile.TemporaryDirectory() as tmpdir:
        output_image_path = os.path.join(tmpdir, "test_image.jpg")
        pdf_image_utils.write_image(image, output_image_path)
        assert os.path.exists(output_image_path)

        # Additional check to see if the written image can be read. The context
        # manager closes the file handle before the temporary directory is removed,
        # so no open handle outlives the directory it points into.
        with PILImg.open(output_image_path) as read_image:
            assert read_image is not None
|
|
|
|
|
|
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(
    file_mode, path_only, filename=example_doc_path("pdf/embedded-images.pdf")
):
    """`convert_pdf_to_image` accepts a filename or an open binary file.

    With `path_only=True` the first result must be a `str`; otherwise it must be
    a PIL image.
    """
    # The element type expected for the first result depends only on `path_only`.
    expected_type = str if path_only else PILImg.Image

    with tempfile.TemporaryDirectory() as output_folder:
        if file_mode == "filename":
            converted = pdf_image_utils.convert_pdf_to_image(
                filename=filename,
                file=None,
                output_folder=output_folder,
                path_only=path_only,
            )
        else:
            # "rb" mode: hand over an open binary file object and an empty filename.
            with open(filename, "rb") as pdf_file:
                converted = pdf_image_utils.convert_pdf_to_image(
                    filename="",
                    file=pdf_file,
                    output_folder=output_folder,
                    path_only=path_only,
                )

        first_result = converted[0]
        assert isinstance(first_result, expected_type)
|
|
|
|
|
|
def test_convert_pdf_to_image_raises_error(filename=example_doc_path("pdf/embedded-images.pdf")):
    """`convert_pdf_to_image` rejects `path_only=True` without an output folder.

    The example document is referenced under the `pdf/` sub-directory, the same
    location `test_convert_pdf_to_image` uses for this file.
    """
    with pytest.raises(ValueError) as exc_info:
        pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)

    assert str(exc_info.value) == "output_folder must be specified if path_only is true"
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
        (example_doc_path("img/english-and-korean.png"), True),
    ],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_save_elements(
    element_category_to_save,
    extract_image_block_to_payload,
    filename,
    is_image,
):
    """`save_elements` stores matching elements either on disk or in the payload.

    For every element whose category equals `element_category_to_save`, the test
    expects either a `.jpg` file at the derived path plus `image_path` metadata, or
    (when `extract_image_block_to_payload` is true) base64/mime-type metadata and
    no file on disk.
    """

    def build_element(element_cls, text, left, right, page_number):
        # Every fixture shares the same vertical extent (86..519) and pixel space;
        # only the horizontal extent, text, class and page differ.
        return element_cls(
            text=text,
            coordinates=((left, 86), (left, 519), (right, 519), (right, 86)),
            coordinate_system=PixelSpace(width=1575, height=1166),
            metadata=ElementMetadata(page_number=page_number),
        )

    elements = [
        build_element(Image, "Image Text 1", 78, 512, 1),
        build_element(Image, "Image Text 2", 570, 1003, 1),
        # NOTE: this fixture is an `Image` element even though its text is "Table 1".
        build_element(Image, "Table 1", 1062, 1496, 1),
    ]
    if not is_image:
        # add a page 2 element
        elements.append(build_element(Table, "Table 2", 1062, 1496, 2))

    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = str(tmpdir)
        pdf_image_utils.save_elements(
            elements=elements,
            starting_page_number=1,
            element_category_to_save=element_category_to_save,
            pdf_image_dpi=200,
            filename=filename,
            is_image=is_image,
            output_dir_path=output_dir,
            extract_image_block_to_payload=extract_image_block_to_payload,
        )

        matching = [el for el in elements if el.category == element_category_to_save]
        for position, element in enumerate(matching, start=1):
            metadata = element.metadata
            prefix = "table" if element.category == ElementType.TABLE else "figure"
            expected_image_path = os.path.join(
                output_dir, f"{prefix}-{metadata.page_number}-{position}.jpg"
            )

            if extract_image_block_to_payload:
                # Payload mode: image data lives in metadata, nothing is written to disk.
                assert isinstance(metadata.image_base64, str)
                assert isinstance(metadata.image_mime_type, str)
                assert not metadata.image_path
                assert not os.path.isfile(expected_image_path)
            else:
                # File mode: the image is on disk and only its path is recorded.
                assert os.path.isfile(expected_image_path)
                assert metadata.image_path == expected_image_path
                assert not metadata.image_base64
                assert not metadata.image_mime_type
|
|
|
|
|
|
@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
    """`save_elements` creates a `figures` directory when `output_dir_path` is None.

    The directory is expected under `env_config.GLOBAL_WORKING_PROCESS_DIR` when
    `GLOBAL_WORKING_DIR_ENABLED` is set to a true value, and under the current
    working directory otherwise.
    """
    # Environment variables are strings; convert explicitly rather than relying on
    # pytest's implicit (and warning-emitting) conversion of non-str values.
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", str(storage_enabled))
    with (
        patch("PIL.Image.open"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"),
        tempfile.TemporaryDirectory() as tmpdir,
    ):
        original_cwd = os.getcwd()
        os.chdir(tmpdir)
        try:
            pdf_image_utils.save_elements(
                elements=[],
                element_category_to_save="",
                starting_page_number=1,
                pdf_image_dpi=200,
                filename="dummy.pdf",
                output_dir_path=None,
            )

            # Verify that the output directory was created in the expected location
            if storage_enabled:
                from unstructured.partition.utils.config import env_config

                expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
            else:
                expected_output_dir = os.path.join(tmpdir, "figures")
            assert os.path.exists(expected_output_dir)
            assert os.path.isdir(expected_output_dir)
        finally:
            # Always restore the working directory, even when an assertion fails, so
            # later tests do not run from inside a removed temporary directory.
            os.chdir(original_cwd)
|
|
|
|
|
|
def test_write_image_raises_error():
    """`write_image` raises `ValueError` when given a plain string as the image."""
    unsupported_image = "invalid_type"
    target_path = "test_image.jpg"
    with pytest.raises(ValueError):
        pdf_image_utils.write_image(unsupported_image, target_path)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "outcome"),
    [
        ("", False),
        ("foo", True),
        (None, False),
        ("(cid:10)boo", False),
    ],
)
def test_valid_text(text, outcome):
    """`valid_text` is true only for the plain, non-empty, cid-free sample."""
    is_valid = pdf_image_utils.valid_text(text)
    assert is_valid == outcome
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [("base", 0.0), ("", 0.0), ("(cid:2)", 1.0), ("(cid:1)a", 0.5), ("c(cid:1)ab", 0.25)],
)
def test_cid_ratio(text, expected):
    """`cid_ratio` returns the expected ratio for each sample text."""
    ratio = pdf_image_utils.cid_ratio(text)
    assert ratio == expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [("base", False), ("(cid:2)", True), ("(cid:1234567890)", True), ("jkl;(cid:12)asdf", True)],
)
def test_is_cid_present(text, expected):
    """`is_cid_present` flags every sample containing a `(cid:N)` token."""
    found = pdf_image_utils.is_cid_present(text)
    assert found == expected
|
|
|
|
|
|
def test_pad_bbox():
    """`pad_bbox` grows a box by the horizontal and vertical padding on each side."""
    horizontal_padding, vertical_padding = 10, 20
    original_box = (100, 100, 200, 200)

    padded_box = pdf_image_utils.pad_bbox(original_box, (horizontal_padding, vertical_padding))

    assert padded_box == (90, 80, 210, 220)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("input_types", "expected"),
    [
        (None, []),
        (["table", "image"], ["Table", "Image"]),
        (["unknown"], ["Unknown"]),
        (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]),
        (["NarrativeText", "narrativetext"], ["NarrativeText", "NarrativeText"]),
    ],
)
def test_check_element_types_to_extract(input_types, expected):
    """`check_element_types_to_extract` returns the expected names for each input."""
    normalized = pdf_image_utils.check_element_types_to_extract(input_types)
    assert normalized == expected
|
|
|
|
|
|
def test_check_element_types_to_extract_raises_error():
    """A non-list argument raises `TypeError` whose message mentions "must be a list"."""
    not_a_list = "not a list"
    with pytest.raises(TypeError) as exc_info:
        pdf_image_utils.check_element_types_to_extract(not_a_list)

    message = str(exc_info.value)
    assert "must be a list" in message
|
|
|
|
|
|
class MockPageLayout:
    """Minimal page-layout stand-in whose `annotate` returns a fixed placeholder."""

    def annotate(self, colors):
        """Ignore `colors` and return the placeholder value "mock_image"."""
        annotated = "mock_image"
        return annotated
|
|
|
|
|
|
class MockDocumentLayout:
    """Document-layout stand-in exposing two mock pages through `pages`."""

    # Both entries are instances (not the bare class) so that per-page method calls
    # such as `page.annotate(...)` bind `self` correctly.
    pages = [MockPageLayout(), MockPageLayout()]
|
|
|
|
|
|
def test_annotate_layout_elements_with_image():
    """`annotate_layout_elements_with_image` writes one image per provided layout.

    `write_image` is patched, and the second positional argument of each call is
    inspected for the expected `<basename>_<page>_<kind>.jpg` file name.
    """
    write_image_target = "unstructured.partition.pdf_image.pdf_image_utils.write_image"
    inferred_layout = MockPageLayout()
    extracted_layout = MockPageLayout()
    output_basename = "test_page"
    page_number = 1

    inferred_name = f"{output_basename}_{page_number}_inferred.jpg"
    extracted_name = f"{output_basename}_{page_number}_extracted.jpg"

    # Check if images for both layouts were saved
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch(write_image_target) as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_layout,
            extracted_page_layout=extracted_layout,
            output_dir_path=str(tmpdir),
            output_f_basename=output_basename,
            page_number=page_number,
        )

        written_paths = [call.args[1] for call in mock_write_image.call_args_list]
        for wanted_name in (inferred_name, extracted_name):
            assert any(wanted_name in path for path in written_paths)

    # Check if only the inferred layout image was saved if extracted layout is None
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch(write_image_target) as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_layout,
            extracted_page_layout=None,
            output_dir_path=str(tmpdir),
            output_f_basename=output_basename,
            page_number=page_number,
        )

        written_paths = [call.args[1] for call in mock_write_image.call_args_list]
        assert any(inferred_name in path for path in written_paths)
        assert len(written_paths) == 1  # Only one image should be saved
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
    ],
)
def test_annotate_layout_elements(filename, is_image):
    """`annotate_layout_elements` annotates once for an image, once per page for a PDF.

    `PIL.Image.open`, `convert_pdf_to_image` and
    `annotate_layout_elements_with_image` are all patched; only the number of calls
    to the latter is checked.
    """
    page_image_paths = ["/path/to/image1.jpg", "/path/to/image2.jpg"]

    with (
        patch("PIL.Image.open"),
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image",
            return_value=page_image_paths,
        ) as mock_pdf2image,
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image"
        ) as mock_annotate_page,
    ):
        pdf_image_utils.annotate_layout_elements(
            # The class itself is passed; `pages` is defined as a class attribute.
            inferred_document_layout=MockDocumentLayout,
            extracted_layout=[MagicMock(), MagicMock()],
            filename=filename,
            output_dir_path="/output",
            pdf_image_dpi=200,
            is_image=is_image,
        )

        if not is_image:
            # PDF input: one annotation call per converted page image.
            expected_calls = len(mock_pdf2image.return_value)
            assert mock_annotate_page.call_count == expected_calls
        else:
            mock_annotate_page.assert_called_once()
|
|
|
|
|
|
def test_annotate_layout_elements_file_not_found_error():
    """`annotate_layout_elements` raises `FileNotFoundError` for a missing image file."""
    call_kwargs = {
        "inferred_document_layout": MagicMock(),
        "extracted_layout": [],
        "filename": "nonexistent.jpg",
        "output_dir_path": "/output",
        "pdf_image_dpi": 200,
        "is_image": True,
    }
    with pytest.raises(FileNotFoundError):
        pdf_image_utils.annotate_layout_elements(**call_kwargs)
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("test\tco\x0cn\ftrol\ncharacter\rs\b", "test control characters"),
        ("\"'\\", "\"'\\"),
    ],
)
def test_remove_control_characters(text, expected):
    """`remove_control_characters` cleans the control-character sample and keeps quotes/backslash."""
    cleaned = pdf_image_utils.remove_control_characters(text)
    assert cleaned == expected
|