enhancement: allow setting image block crop padding parameter (#2415)

Closes #2320.

### Summary
In certain circumstances, adjusting the image block crop padding can improve image block extraction by preventing extracted image blocks from being clipped.

### Testing
- PDF: [LM339-D_2-2.pdf](https://github.com/Unstructured-IO/unstructured/files/13968952/LM339-D_2-2.pdf)
- Set two environment variables `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD` (e.g. `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD = 40`, `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD = 20`), then run:

```
elements = partition_pdf(
    filename="LM339-D_2-2.pdf",
    extract_image_block_types=["image"],
)
```
2025-12-04 19:16:03 +00:00 · 2024-01-18 22:28:32 -08:00 · 2024-01-18 22:28:32 -08:00 · 7378a378f6
commit 7378a378f6
parent 1a305866d1
7 changed files with 70 additions and 22 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,8 +1,9 @@
-## 0.12.1-dev17
+## 0.12.1-dev18


 ### Enhancements

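+* **Allow setting image block crop padding parameters** In certain circumstances, adjusting the image block crop padding (via the `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD` environment variables) can improve image block extraction by preventing extracted image blocks from being clipped.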
 * **Add support for bitmap images in `partition_image`** Adds support for `.bmp` files in
  `partition`, `partition_image`, and `detect_filetype`.
 * **Keep all image elements when using "hi_res" strategy** Previously, `Image` elements with small chunks of text were ignored unless the image block extraction parameters (`extract_images_in_pdf` or `extract_image_block_types`) were specified. Now, all image elements are kept regardless of whether the image block extraction parameters are specified.
--- a/docs/source/core/partition.rst
+++ b/docs/source/core/partition.rst
@ -634,7 +634,7 @@ The available strategies for PDFs are ``"auto"``, ``"hi_res"``, ``"ocr_only"``,

 * The ``"fast"`` strategy will extract the text using ``pdfminer`` and process the raw text with ``partition_text``. If the PDF text is not extractable, ``partition_pdf`` will fall back to ``"ocr_only"``. We recommend using the ``"fast"`` strategy in most cases where the PDF has extractable text.

-To extract images and elements as image blocks from a PDF, it is mandatory to set ``strategy="hi_res"`` when setting ``extract_images_in_pdf=True``. With this configuration, detected images are saved in a specified directory or encoded within the file. However, keep in mind that ``extract_images_in_pdf`` is being phased out in favor of ``extract_image_block_types``. This option allows you to specify types of images or elements, like "Image" or "Table". For integrating these images directly into web applications or APIs, ``extract_image_block_to_payload`` can be used to convert them into ``base64`` format, including details about the image type. Lastly, the ``extract_image_block_output_dir`` can be used to specify the filesystem path for saving the extracted images when not embedding them in payloads.
+To extract images and elements as image blocks from a PDF, it is mandatory to set ``strategy="hi_res"`` when setting ``extract_images_in_pdf=True``. With this configuration, detected images are saved in a specified directory or encoded within the file. However, keep in mind that ``extract_images_in_pdf`` is being phased out in favor of ``extract_image_block_types``. This option allows you to specify types of images or elements, like "Image" or "Table". If some extracted images have their content clipped, you can add padding (in pixels, ``0`` by default) around each cropped region by setting the environment variables ``EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD`` and ``EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD`` (for example, ``EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD=20`` and ``EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD=10``). For integrating these images directly into web applications or APIs, ``extract_image_block_to_payload`` can be used to convert them into ``base64`` format, including details about the image type. Lastly, the ``extract_image_block_output_dir`` can be used to specify the filesystem path for saving the extracted images when not embedding them in payloads.

 Examples:

--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@ -132,3 +132,12 @@ def test_write_image_raises_error():
 )
 def test_valid_text(text, outcome):
    assert pdf_image_utils.valid_text(text) == outcome
+
+
+def test_pad_bbox():
+    bbox = (100, 100, 200, 200)
+    padding = (10, 20)  # Horizontal padding 10, Vertical padding 20
+    expected = (90, 80, 210, 220)
+
+    result = pdf_image_utils.pad_bbox(bbox, padding)
+    assert result == expected
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.12.1-dev17"  # pragma: no cover
+__version__ = "0.12.1-dev18"  # pragma: no cover
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@ -1,6 +1,5 @@
 import os
 import tempfile
-from copy import deepcopy
 from typing import BinaryIO, Dict, List, Optional, Union, cast

 import cv2
@ -24,7 +23,7 @@ from unstructured_pytesseract import Output

 from unstructured.documents.elements import ElementType
 from unstructured.logger import logger
-from unstructured.partition.pdf_image.pdf_image_utils import valid_text
+from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes, valid_text
 from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import (
    IMAGE_COLOR_DEPTH,
@ -363,21 +362,6 @@ def get_layout_elements_from_ocr(
    return layout_elements


-def pad_element_bboxes(
-    element: "LayoutElement",
-    padding: Union[int, float],
-) -> "LayoutElement":
-    """Increases (or decreases, if padding is negative) the size of the bounding
-    boxes of the element by extending the boundary outward (resp. inward)"""
-
-    out_element = deepcopy(element)
-    out_element.bbox.x1 -= padding
-    out_element.bbox.x2 += padding
-    out_element.bbox.y1 -= padding
-    out_element.bbox.y2 += padding
-    return out_element
-
-
 def zoom_image(image: PILImage, zoom: float = 1) -> PILImage:
    """scale an image based on the zoom factor using cv2; the scaled image is post processed by
    dilation then erosion to improve edge sharpness for OCR tasks"""
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@ -1,9 +1,10 @@
 import base64
 import os
 import tempfile
+from copy import deepcopy
 from io import BytesIO
 from pathlib import PurePath
-from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast

 import cv2
 import numpy as np
@ -13,9 +14,11 @@ from PIL import Image
 from unstructured.documents.elements import ElementType
 from unstructured.logger import logger
 from unstructured.partition.common import convert_to_bytes
+from unstructured.partition.utils.config import env_config

 if TYPE_CHECKING:
    from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
+    from unstructured_inference.inference.layoutelement import LayoutElement

    from unstructured.documents.elements import Element

@ -75,6 +78,38 @@ def convert_pdf_to_image(
    return images


+def pad_element_bboxes(
+    element: "LayoutElement",
+    padding: Union[int, float],
+) -> "LayoutElement":
+    """Increases (or decreases, if padding is negative) the size of the bounding
+    boxes of the element by extending the boundary outward (resp. inward)"""
+
+    out_element = deepcopy(element)
+    out_element.bbox.x1 -= padding
+    out_element.bbox.x2 += padding
+    out_element.bbox.y1 -= padding
+    out_element.bbox.y2 += padding
+
+    return out_element
+
+
+def pad_bbox(
+    bbox: Tuple[float, float, float, float],
+    padding: Tuple[Union[int, float], Union[int, float]],
+) -> Tuple[float, float, float, float]:
+    """Pads a bounding box (bbox) by a specified horizontal and vertical padding."""
+
+    x1, y1, x2, y2 = bbox
+    h_padding, v_padding = padding
+    x1 -= h_padding
+    x2 += h_padding
+    y1 -= v_padding
+    y2 += v_padding
+
+    return x1, y1, x2, y2
+
+
 def save_elements(
    elements: List["Element"],
    element_category_to_save: str,
@ -131,6 +166,11 @@ def save_elements(
            points = coordinates.points
            x1, y1 = points[0]
            x2, y2 = points[2]
+            h_padding = env_config.EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD
+            v_padding = env_config.EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD
+            padded_bbox = cast(
+                Tuple[int, int, int, int], pad_bbox((x1, y1, x2, y2), (h_padding, v_padding))
+            )
            page_number = el.metadata.page_number

            figure_number += 1
@ -142,7 +182,7 @@ def save_elements(
                )
                image_path = image_paths[page_number - 1]
                image = Image.open(image_path)
-                cropped_image = image.crop((x1, y1, x2, y2))
+                cropped_image = image.crop(padded_bbox)
                if extract_image_block_to_payload:
                    buffered = BytesIO()
                    cropped_image.save(buffered, format="JPEG")
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@ -79,5 +79,19 @@ class ENVConfig:
        """
        return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)

+    @property
+    def EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD(self) -> int:
+        """extra image block content to add around an identified element(`Image`, `Table`) region
+        horizontally; measured in pixels
+        """
+        return self._get_int("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", 0)
+
+    @property
+    def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int:
+        """extra image block content to add around an identified element(`Image`, `Table`) region
+        vertically; measured in pixels
+        """
+        return self._get_int("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0)
+

 env_config = ENVConfig()