qued 6c1bbb379c
test: add check crop box padding to save_elements test (#4123)
Updated `save_elements` test to check the behavior of the environment
variables `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and
`EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD` that pad the crop box for image
extraction.

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> Enhances save_elements tests to validate crop-box padding via env vars
and image dimensions for both payload and file outputs; bumps version
and updates changelog.
> 
> - **Tests (pdf_image_utils)**:
> - `test_save_elements` now parametrizes
`horizontal_padding`/`vertical_padding`, sets
`EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and
`EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD`, and asserts padded image
dimensions for both `extract_image_block_to_payload` paths (decoding
`image_base64` or reading saved file).
>   - Adds required imports (`base64`, `io`).
> - **Versioning**:
>   - Update `unstructured/__version__.py` to `0.18.21-dev0`.
>   - Add CHANGELOG entry noting the unit test enhancement.
> 
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
a23bf6a9e7fb67f35c2aa16cc409d2de92886db8. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
2025-11-18 22:01:50 +00:00

384 lines
14 KiB
Python

import base64
import io
import os
import tempfile
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from PIL import Image as PILImg
from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
from unstructured.partition.pdf_image import pdf_image_utils
@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
def test_write_image(image_type):
    """Check that `write_image` writes a readable image file for PIL and numpy inputs."""
    if image_type == "pil":
        image = PILImg.new("RGB", (50, 50))
    else:
        image = np.zeros((50, 50, 3), np.uint8)

    with tempfile.TemporaryDirectory() as tmpdir:
        output_image_path = os.path.join(tmpdir, "test_image.jpg")
        pdf_image_utils.write_image(image, output_image_path)

        assert os.path.exists(output_image_path)

        # Read the file back to confirm it is a decodable image. `PILImg.open` is lazy, so
        # `load()` forces the decode; the context manager releases the file handle before
        # the temporary directory is removed.
        with PILImg.open(output_image_path) as read_image:
            read_image.load()
            assert read_image.size == (50, 50)
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(file_mode, path_only):
    """Convert a PDF given by path or by open binary file and check the result type.

    With `path_only` the first result is expected to be a `str`, otherwise a PIL image.
    """
    pdf_path = example_doc_path("pdf/embedded-images.pdf")
    expected_type = str if path_only else PILImg.Image

    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode != "filename":
            # File-object mode: the filename is deliberately left empty.
            with open(pdf_path, "rb") as pdf_file:
                images = pdf_image_utils.convert_pdf_to_image(
                    filename="",
                    file=pdf_file,
                    output_folder=tmpdir,
                    path_only=path_only,
                )
        else:
            images = pdf_image_utils.convert_pdf_to_image(
                filename=pdf_path,
                file=None,
                output_folder=tmpdir,
                path_only=path_only,
            )

        assert isinstance(images[0], expected_type)
def test_convert_pdf_to_image_raises_error():
    """`path_only=True` without an `output_folder` must raise a descriptive ValueError."""
    # Use the same example document path as `test_convert_pdf_to_image` so the fixture
    # reference is consistent across the module.
    filename = example_doc_path("pdf/embedded-images.pdf")

    with pytest.raises(ValueError) as exc_info:
        pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)

    assert str(exc_info.value) == "output_folder must be specified if path_only is true"
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
        (example_doc_path("img/english-and-korean.png"), True),
    ],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
@pytest.mark.parametrize("horizontal_padding", [0, 20])
@pytest.mark.parametrize("vertical_padding", [0, 10])
def test_save_elements(
    element_category_to_save,
    extract_image_block_to_payload,
    filename,
    is_image,
    horizontal_padding,
    vertical_padding,
    monkeypatch,
):
    """Check where `save_elements` puts cropped element images and how large they are.

    Elements of the requested category must end up either embedded in the element metadata
    (`image_base64` / `image_mime_type`) or written to `<output_dir>/<basename>-<page>-<n>.jpg`,
    never both. In either case the cropped image must measure the element's bounding box
    plus the crop padding configured through the `EXTRACT_IMAGE_BLOCK_CROP_*_PAD` variables.
    """
    # Pin both padding variables for every case: set them when padding is requested and
    # clear them otherwise, so a value inherited from the outer environment cannot skew
    # the zero-padding expectations.
    for env_var, padding in (
        ("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", horizontal_padding),
        ("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", vertical_padding),
    ):
        if padding > 0:
            monkeypatch.setenv(env_var, str(padding))
        else:
            monkeypatch.delenv(env_var, raising=False)

    with tempfile.TemporaryDirectory() as tmpdir:
        elements = [
            Image(
                text="Image Text 1",
                coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="Image Text 2",
                coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="Table 1",
                coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
        ]
        if not is_image:
            # add a page 2 element
            elements.append(
                Table(
                    text="Table 2",
                    coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
                    coordinate_system=PixelSpace(width=1575, height=1166),
                    metadata=ElementMetadata(page_number=2),
                ),
            )

        pdf_image_utils.save_elements(
            elements=elements,
            starting_page_number=1,
            element_category_to_save=element_category_to_save,
            pdf_image_dpi=200,
            filename=filename,
            is_image=is_image,
            output_dir_path=str(tmpdir),
            extract_image_block_to_payload=extract_image_block_to_payload,
        )

        saved_elements = [el for el in elements if el.category == element_category_to_save]
        for i, el in enumerate(saved_elements):
            basename = "table" if el.category == ElementType.TABLE else "figure"
            expected_image_path = os.path.join(
                str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
            )

            # Points 0 and 2 are opposite corners of the bounding box; the crop is expected
            # to grow by the configured padding on each side.
            x1, y1 = el.metadata.coordinates.points[0]
            x2, y2 = el.metadata.coordinates.points[2]
            expected_width = (x2 - x1) + 2 * horizontal_padding
            expected_height = (y2 - y1) + 2 * vertical_padding

            if extract_image_block_to_payload:
                assert isinstance(el.metadata.image_base64, str)
                assert isinstance(el.metadata.image_mime_type, str)
                image_bytes = base64.b64decode(el.metadata.image_base64)
                with PILImg.open(io.BytesIO(image_bytes)) as image:
                    assert image.width == expected_width
                    assert image.height == expected_height
                assert not el.metadata.image_path
                assert not os.path.isfile(expected_image_path)
            else:
                assert os.path.isfile(expected_image_path)
                # Close the handle before the temporary directory is cleaned up.
                with PILImg.open(expected_image_path) as image:
                    assert image.width == expected_width
                    assert image.height == expected_height
                assert el.metadata.image_path == expected_image_path
                assert not el.metadata.image_base64
                assert not el.metadata.image_mime_type
@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
    """Check the default output directory used when `output_dir_path` is None.

    The `figures` directory is expected under `env_config.GLOBAL_WORKING_PROCESS_DIR` when
    `GLOBAL_WORKING_DIR_ENABLED` is switched on, and under the current working directory
    otherwise.
    """
    # Environment values must be strings; `monkeypatch.setenv` emits a warning and converts
    # implicitly when handed a bool, so convert explicitly.
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", str(storage_enabled))
    with (
        patch("PIL.Image.open"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"),
        tempfile.TemporaryDirectory() as tmpdir,
    ):
        original_cwd = os.getcwd()
        os.chdir(tmpdir)
        try:
            pdf_image_utils.save_elements(
                elements=[],
                element_category_to_save="",
                starting_page_number=1,
                pdf_image_dpi=200,
                filename="dummy.pdf",
                output_dir_path=None,
            )

            # Verify that the images are saved in the expected directory
            if storage_enabled:
                from unstructured.partition.utils.config import env_config

                expected_output_dir = os.path.join(
                    env_config.GLOBAL_WORKING_PROCESS_DIR, "figures"
                )
            else:
                expected_output_dir = os.path.join(tmpdir, "figures")
            assert os.path.exists(expected_output_dir)
            assert os.path.isdir(expected_output_dir)
        finally:
            # Always leave the temporary directory, even when the call or an assertion
            # fails, so later tests keep their working directory and the directory can be
            # removed on exit from the `with` block.
            os.chdir(original_cwd)
def test_write_image_raises_error():
    """`write_image` raises ValueError when the image argument is a plain string."""
    unsupported_image = "invalid_type"
    destination = "test_image.jpg"

    with pytest.raises(ValueError):
        pdf_image_utils.write_image(unsupported_image, destination)
@pytest.mark.parametrize(
    ("text", "outcome"),
    [
        ("", False),
        ("foo", True),
        (None, False),
        ("(cid:10)boo", False),
    ],
)
def test_valid_text(text, outcome):
    """Empty, missing and cid-bearing text is invalid; ordinary text is valid."""
    verdict = pdf_image_utils.valid_text(text)
    assert verdict == outcome
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("base", 0.0),
        ("", 0.0),
        ("(cid:2)", 1.0),
        ("(cid:1)a", 0.5),
        ("c(cid:1)ab", 0.25),
    ],
)
def test_cid_ratio(text, expected):
    """`cid_ratio` reports the expected ratio for each sample string."""
    ratio = pdf_image_utils.cid_ratio(text)
    assert ratio == expected
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("base", False),
        ("(cid:2)", True),
        ("(cid:1234567890)", True),
        ("jkl;(cid:12)asdf", True),
    ],
)
def test_is_cid_present(text, expected):
    """`is_cid_present` flags text containing a `(cid:N)` marker anywhere in the string."""
    detected = pdf_image_utils.is_cid_present(text)
    assert detected == expected
def test_pad_bbox():
    """`pad_bbox` widens a box by the horizontal pad on x and the vertical pad on y."""
    horizontal_pad, vertical_pad = 10, 20
    original_box = (100, 100, 200, 200)

    padded_box = pdf_image_utils.pad_bbox(original_box, (horizontal_pad, vertical_pad))

    assert padded_box == (90, 80, 210, 220)
@pytest.mark.parametrize(
    ("input_types", "expected"),
    [
        (None, []),
        (["table", "image"], ["Table", "Image"]),
        (["unknown"], ["Unknown"]),
        (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]),
        (["NarrativeText", "narrativetext"], ["NarrativeText", "NarrativeText"]),
    ],
)
def test_check_element_types_to_extract(input_types, expected):
    """Element type names are normalised regardless of input casing; None yields []."""
    normalised = pdf_image_utils.check_element_types_to_extract(input_types)
    assert normalised == expected
def test_check_element_types_to_extract_raises_error():
    """A string argument is rejected with a TypeError whose message says 'must be a list'."""
    with pytest.raises(TypeError) as raised:
        pdf_image_utils.check_element_types_to_extract("not a list")

    message = str(raised.value)
    assert "must be a list" in message
class MockPageLayout:
    """Minimal stand-in for a page layout that can be annotated."""

    def annotate(self, colors):
        """Return a placeholder value instead of rendering an annotated page image."""
        return "mock_image"


class MockDocumentLayout:
    """Minimal stand-in for a document layout exposing two mock pages."""

    # Both entries are instances (not the class object) so every page supports
    # `page.annotate(colors)` the same way.
    pages = [MockPageLayout(), MockPageLayout()]
def test_annotate_layout_elements_with_image():
    """Check which annotated page images are written for the supplied layouts.

    With both layouts present, an `_inferred.jpg` and an `_extracted.jpg` file are expected;
    with the extracted layout missing, exactly one write (the inferred image) is expected.
    """
    inferred_layout = MockPageLayout()
    extracted_layout = MockPageLayout()
    output_basename = "test_page"
    page_number = 1
    inferred_name = f"{output_basename}_{page_number}_inferred.jpg"
    extracted_name = f"{output_basename}_{page_number}_extracted.jpg"

    # Case 1: both layouts are provided, so both images should be written.
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_layout,
            extracted_page_layout=extracted_layout,
            output_dir_path=str(tmpdir),
            output_f_basename=output_basename,
            page_number=page_number,
        )

        # The second positional argument of each `write_image` call is the output path.
        written_paths = [recorded.args[1] for recorded in mock_write_image.call_args_list]
        for wanted_name in (inferred_name, extracted_name):
            assert any(wanted_name in written_path for written_path in written_paths)

    # Case 2: no extracted layout, so only the inferred image should be written.
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_layout,
            extracted_page_layout=None,
            output_dir_path=str(tmpdir),
            output_f_basename=output_basename,
            page_number=page_number,
        )

        written_paths = [recorded.args[1] for recorded in mock_write_image.call_args_list]
        assert any(inferred_name in written_path for written_path in written_paths)
        assert len(written_paths) == 1  # Only one image should be saved
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
    ],
)
def test_annotate_layout_elements(filename, is_image):
    """Check how often the per-page annotation helper is invoked.

    An image input should trigger a single call; a PDF input should trigger one call per
    page image returned by the (mocked) PDF-to-image conversion.
    """
    inferred_document_layout = MockDocumentLayout
    extracted_layout = [MagicMock(), MagicMock()]
    page_image_paths = ["/path/to/image1.jpg", "/path/to/image2.jpg"]

    with (
        patch("PIL.Image.open"),
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image",
            return_value=page_image_paths,
        ) as mock_pdf2image,
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image"
        ) as mock_annotate_layout_elements_with_image,
    ):
        pdf_image_utils.annotate_layout_elements(
            inferred_document_layout=inferred_document_layout,
            extracted_layout=extracted_layout,
            filename=filename,
            output_dir_path="/output",
            pdf_image_dpi=200,
            is_image=is_image,
        )

        expected_calls = 1 if is_image else len(mock_pdf2image.return_value)
        assert mock_annotate_layout_elements_with_image.call_count == expected_calls
def test_annotate_layout_elements_file_not_found_error():
    """Annotating a non-existent image file raises FileNotFoundError."""
    missing_image = "nonexistent.jpg"
    call_kwargs = {
        "inferred_document_layout": MagicMock(),
        "extracted_layout": [],
        "filename": missing_image,
        "output_dir_path": "/output",
        "pdf_image_dpi": 200,
        "is_image": True,
    }

    with pytest.raises(FileNotFoundError):
        pdf_image_utils.annotate_layout_elements(**call_kwargs)
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("test\tco\x0cn\ftrol\ncharacter\rs\b", "test control characters"),
        ("\"'\\", "\"'\\"),
    ],
)
def test_remove_control_characters(text, expected):
    """Control characters are cleaned from text; quotes and backslashes are left alone."""
    cleaned = pdf_image_utils.remove_control_characters(text)
    assert cleaned == expected