unstructured/test_unstructured/partition/pdf_image/test_pdf_image_utils.py

import base64
import io
import os
import tempfile
from unittest.mock import MagicMock, patch

import numpy as np
import pytest
from PIL import Image as PILImg

from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
from unstructured.partition.pdf_image import pdf_image_utils


@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
def test_write_image(image_type):
    mock_pil_image = PILImg.new("RGB", (50, 50))
    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)

    image_map = {
        "pil": mock_pil_image,
        "numpy_array": mock_numpy_image,
    }
    image = image_map[image_type]

    with tempfile.TemporaryDirectory() as tmpdir:
        output_image_path = os.path.join(tmpdir, "test_image.jpg")
        pdf_image_utils.write_image(image, output_image_path)
        assert os.path.exists(output_image_path)

        # Additional check to see if the written image can be read
        read_image = PILImg.open(output_image_path)
        assert read_image is not None


@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image(file_mode, path_only):
    filename = example_doc_path("pdf/embedded-images.pdf")
    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode == "filename":
            images = pdf_image_utils.convert_pdf_to_image(
                filename=filename,
                file=None,
                output_folder=tmpdir,
                path_only=path_only,
            )
        else:
            with open(filename, "rb") as f:
                images = pdf_image_utils.convert_pdf_to_image(
                    filename="",
                    file=f,
                    output_folder=tmpdir,
                    path_only=path_only,
                )

        if path_only:
            assert isinstance(images[0], str)
        else:
            assert isinstance(images[0], PILImg.Image)


def test_convert_pdf_to_image_raises_error():
    filename = example_doc_path("embedded-images.pdf")
    with pytest.raises(ValueError) as exc_info:
        pdf_image_utils.convert_pdf_to_image(filename=filename, path_only=True, output_folder=None)

    assert str(exc_info.value) == "output_folder must be specified if path_only is true"


@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
        (example_doc_path("img/english-and-korean.png"), True),
    ],
)
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
@pytest.mark.parametrize("horizontal_padding", [0, 20])
@pytest.mark.parametrize("vertical_padding", [0, 10])
def test_save_elements(
    element_category_to_save,
    extract_image_block_to_payload,
    filename,
    is_image,
    horizontal_padding,
    vertical_padding,
    monkeypatch,
):
    if horizontal_padding > 0:
        monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", str(horizontal_padding))
    if vertical_padding > 0:
        monkeypatch.setenv("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", str(vertical_padding))
    with tempfile.TemporaryDirectory() as tmpdir:
        elements = [
            Image(
                text="Image Text 1",
                coordinates=((78, 86), (78, 519), (512, 519), (512, 86)),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="Image Text 2",
                coordinates=((570, 86), (570, 519), (1003, 519), (1003, 86)),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
            Image(
                text="Table 1",
                coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
                coordinate_system=PixelSpace(width=1575, height=1166),
                metadata=ElementMetadata(page_number=1),
            ),
        ]
        if not is_image:
            # add a page 2 element
            elements.append(
                Table(
                    text="Table 2",
                    coordinates=((1062, 86), (1062, 519), (1496, 519), (1496, 86)),
                    coordinate_system=PixelSpace(width=1575, height=1166),
                    metadata=ElementMetadata(page_number=2),
                ),
            )

        pdf_image_utils.save_elements(
            elements=elements,
            starting_page_number=1,
            element_category_to_save=element_category_to_save,
            pdf_image_dpi=200,
            filename=filename,
            is_image=is_image,
            output_dir_path=str(tmpdir),
            extract_image_block_to_payload=extract_image_block_to_payload,
        )

        saved_elements = [el for el in elements if el.category == element_category_to_save]
        for i, el in enumerate(saved_elements):
            basename = "table" if el.category == ElementType.TABLE else "figure"
            expected_image_path = os.path.join(
                str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
            )
            if extract_image_block_to_payload:
                assert isinstance(el.metadata.image_base64, str)
                assert isinstance(el.metadata.image_mime_type, str)
                image_bytes = base64.b64decode(el.metadata.image_base64)
                image = PILImg.open(io.BytesIO(image_bytes))
                x1, y1 = el.metadata.coordinates.points[0]
                x2, y2 = el.metadata.coordinates.points[2]
                width = x2 - x1
                height = y2 - y1
                assert image.width == width + 2 * horizontal_padding
                assert image.height == height + 2 * vertical_padding
                assert not el.metadata.image_path
                assert not os.path.isfile(expected_image_path)
            else:
                assert os.path.isfile(expected_image_path)
                image = PILImg.open(expected_image_path)
                x1, y1 = el.metadata.coordinates.points[0]
                x2, y2 = el.metadata.coordinates.points[2]
                width = x2 - x1
                height = y2 - y1
                assert image.width == width + 2 * horizontal_padding
                assert image.height == height + 2 * vertical_padding
                assert el.metadata.image_path == expected_image_path
                assert not el.metadata.image_base64
                assert not el.metadata.image_mime_type


@pytest.mark.parametrize("storage_enabled", [False, True])
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
    monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
    with (
        patch("PIL.Image.open"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
        patch("unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image"),
        tempfile.TemporaryDirectory() as tmpdir,
    ):
        original_cwd = os.getcwd()
        os.chdir(tmpdir)
        pdf_image_utils.save_elements(
            elements=[],
            element_category_to_save="",
            starting_page_number=1,
            pdf_image_dpi=200,
            filename="dummy.pdf",
            output_dir_path=None,
        )

        # Verify that the images are saved in the expected directory
        if storage_enabled:
            from unstructured.partition.utils.config import env_config

            expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
        else:
            expected_output_dir = os.path.join(tmpdir, "figures")
        assert os.path.exists(expected_output_dir)
        assert os.path.isdir(expected_output_dir)
        os.chdir(original_cwd)


def test_write_image_raises_error():
    with pytest.raises(ValueError):
        pdf_image_utils.write_image("invalid_type", "test_image.jpg")


@pytest.mark.parametrize(
    ("text", "outcome"), [("", False), ("foo", True), (None, False), ("(cid:10)boo", False)]
)
def test_valid_text(text, outcome):
    assert pdf_image_utils.valid_text(text) == outcome


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("base", 0.0),
        ("", 0.0),
        ("(cid:2)", 1.0),
        ("(cid:1)a", 0.5),
        ("c(cid:1)ab", 0.25),
    ],
)
def test_cid_ratio(text, expected):
    assert pdf_image_utils.cid_ratio(text) == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("base", False),
        ("(cid:2)", True),
        ("(cid:1234567890)", True),
        ("jkl;(cid:12)asdf", True),
    ],
)
def test_is_cid_present(text, expected):
    assert pdf_image_utils.is_cid_present(text) == expected


def test_pad_bbox():
    bbox = (100, 100, 200, 200)
    padding = (10, 20)  # Horizontal padding 10, Vertical padding 20
    expected = (90, 80, 210, 220)

    result = pdf_image_utils.pad_bbox(bbox, padding)
    assert result == expected


@pytest.mark.parametrize(
    ("input_types", "expected"),
    [
        (None, []),
        (["table", "image"], ["Table", "Image"]),
        (["unknown"], ["Unknown"]),
        (["Table", "image", "UnknOwn"], ["Table", "Image", "Unknown"]),
        (["NarrativeText", "narrativetext"], ["NarrativeText", "NarrativeText"]),
    ],
)
def test_check_element_types_to_extract(input_types, expected):
    assert pdf_image_utils.check_element_types_to_extract(input_types) == expected


def test_check_element_types_to_extract_raises_error():
    with pytest.raises(TypeError) as exc_info:
        pdf_image_utils.check_element_types_to_extract("not a list")
    assert "must be a list" in str(exc_info.value)


class MockPageLayout:
    def annotate(self, colors):
        return "mock_image"


class MockDocumentLayout:
    pages = [MockPageLayout(), MockPageLayout]


def test_annotate_layout_elements_with_image():
    inferred_layout = MockPageLayout()
    extracted_layout = MockPageLayout()
    output_basename = "test_page"
    page_number = 1

    # Check if images for both layouts were saved
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_layout,
            extracted_page_layout=extracted_layout,
            output_dir_path=str(tmpdir),
            output_f_basename=output_basename,
            page_number=page_number,
        )

        expected_filenames = [
            f"{output_basename}_{page_number}_inferred.jpg",
            f"{output_basename}_{page_number}_extracted.jpg",
        ]
        actual_calls = [call.args[1] for call in mock_write_image.call_args_list]
        for expected_filename in expected_filenames:
            assert any(expected_filename in actual_call for actual_call in actual_calls)

    # Check if only the inferred layout image was saved if extracted layout is None
    with (
        tempfile.TemporaryDirectory() as tmpdir,
        patch("unstructured.partition.pdf_image.pdf_image_utils.write_image") as mock_write_image,
    ):
        pdf_image_utils.annotate_layout_elements_with_image(
            inferred_page_layout=inferred_layout,
            extracted_page_layout=None,
            output_dir_path=str(tmpdir),
            output_f_basename=output_basename,
            page_number=page_number,
        )

        expected_filename = f"{output_basename}_{page_number}_inferred.jpg"
        actual_calls = [call.args[1] for call in mock_write_image.call_args_list]
        assert any(expected_filename in actual_call for actual_call in actual_calls)
        assert len(actual_calls) == 1  # Only one image should be saved


@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
        (example_doc_path("img/layout-parser-paper-fast.jpg"), True),
    ],
)
def test_annotate_layout_elements(filename, is_image):
    inferred_document_layout = MockDocumentLayout
    extracted_layout = [MagicMock(), MagicMock()]

    with (
        patch("PIL.Image.open"),
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.convert_pdf_to_image",
            return_value=["/path/to/image1.jpg", "/path/to/image2.jpg"],
        ) as mock_pdf2image,
        patch(
            "unstructured.partition.pdf_image.pdf_image_utils.annotate_layout_elements_with_image"
        ) as mock_annotate_layout_elements_with_image,
    ):
        pdf_image_utils.annotate_layout_elements(
            inferred_document_layout=inferred_document_layout,
            extracted_layout=extracted_layout,
            filename=filename,
            output_dir_path="/output",
            pdf_image_dpi=200,
            is_image=is_image,
        )
        if is_image:
            mock_annotate_layout_elements_with_image.assert_called_once()
        else:
            assert mock_annotate_layout_elements_with_image.call_count == len(
                mock_pdf2image.return_value
            )


def test_annotate_layout_elements_file_not_found_error():
    with pytest.raises(FileNotFoundError):
        pdf_image_utils.annotate_layout_elements(
            inferred_document_layout=MagicMock(),
            extracted_layout=[],
            filename="nonexistent.jpg",
            output_dir_path="/output",
            pdf_image_dpi=200,
            is_image=True,
        )


@pytest.mark.parametrize(
    ("text", "expected"),
    [("test\tco\x0cn\ftrol\ncharacter\rs\b", "test control characters"), ("\"'\\", "\"'\\")],
)
def test_remove_control_characters(text, expected):
    assert pdf_image_utils.remove_control_characters(text) == expected