enhancement: allow setting image block crop padding parameter (#2415)

Closes #2320.

### Summary
In certain circumstances, adjusting the image block crop padding can
improve image block extraction by preventing extracted image blocks from
being clipped.

### Testing
- PDF:
[LM339-D_2-2.pdf](https://github.com/Unstructured-IO/unstructured/files/13968952/LM339-D_2-2.pdf)
- Set two environment variables,
`EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and
`EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD`
(e.g. `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD = 40`,
`EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD = 20`), then run:

```
elements = partition_pdf(
    filename="LM339-D_2-2.pdf",
    extract_image_block_types=["image"],
)
```
This commit is contained in:
Christine Straub 2024-01-18 22:28:32 -08:00 committed by GitHub
parent 1a305866d1
commit 7378a378f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 70 additions and 22 deletions

View File

@ -1,8 +1,9 @@
## 0.12.1-dev17
## 0.12.1-dev18
### Enhancements
* **Allow setting image block crop padding parameter** In certain circumstances, adjusting the image block crop padding can improve image block extraction by preventing extracted image blocks from being clipped.
* **Add support for bitmap images in `partition_image`** Adds support for `.bmp` files in
`partition`, `partition_image`, and `detect_filetype`.
* **Keep all image elements when using "hi_res" strategy** Previously, `Image` elements with small chunks of text were ignored unless the image block extraction parameters (`extract_images_in_pdf` or `extract_image_block_types`) were specified. Now, all image elements are kept regardless of whether the image block extraction parameters are specified.

View File

@ -634,7 +634,7 @@ The available strategies for PDFs are ``"auto"``, ``"hi_res"``, ``"ocr_only"``,
* The ``"fast"`` strategy will extract the text using ``pdfminer`` and process the raw text with ``partition_text``. If the PDF text is not extractable, ``partition_pdf`` will fall back to ``"ocr_only"``. We recommend using the ``"fast"`` strategy in most cases where the PDF has extractable text.
To extract images and elements as image blocks from a PDF, it is mandatory to set ``strategy="hi_res"`` when setting ``extract_images_in_pdf=True``. With this configuration, detected images are saved in a specified directory or encoded within the file. However, keep in mind that ``extract_images_in_pdf`` is being phased out in favor of ``extract_image_block_types``. This option allows you to specify types of images or elements, like "Image" or "Table". For integrating these images directly into web applications or APIs, ``extract_image_block_to_payload`` can be used to convert them into ``base64`` format, including details about the image type. Lastly, the ``extract_image_block_output_dir`` can be used to specify the filesystem path for saving the extracted images when not embedding them in payloads.
To extract images and elements as image blocks from a PDF, it is mandatory to set ``strategy="hi_res"`` when setting ``extract_images_in_pdf=True``. With this configuration, detected images are saved in a specified directory or encoded within the file. However, keep in mind that ``extract_images_in_pdf`` is being phased out in favor of ``extract_image_block_types``. This option allows you to specify types of images or elements, like "Image" or "Table". If some extracted images have content clipped, you can adjust the crop padding (measured in pixels) by setting two environment variables, ``EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD`` and ``EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD`` (for example, ``EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD = 20`` and ``EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD = 10``). For integrating these images directly into web applications or APIs, ``extract_image_block_to_payload`` can be used to convert them into ``base64`` format, including details about the image type. Lastly, the ``extract_image_block_output_dir`` can be used to specify the filesystem path for saving the extracted images when not embedding them in payloads.
Examples:

View File

@ -132,3 +132,12 @@ def test_write_image_raises_error():
)
def test_valid_text(text, outcome):
assert pdf_image_utils.valid_text(text) == outcome
def test_pad_bbox():
    """`pad_bbox` widens x by the horizontal pad and y by the vertical pad."""
    horizontal_pad, vertical_pad = 10, 20
    original_bbox = (100, 100, 200, 200)
    padded_bbox = pdf_image_utils.pad_bbox(original_bbox, (horizontal_pad, vertical_pad))
    # x1/x2 move by 10, y1/y2 move by 20, each away from the box interior.
    assert padded_bbox == (90, 80, 210, 220)

View File

@ -1 +1 @@
__version__ = "0.12.1-dev17" # pragma: no cover
__version__ = "0.12.1-dev18" # pragma: no cover

View File

@ -1,6 +1,5 @@
import os
import tempfile
from copy import deepcopy
from typing import BinaryIO, Dict, List, Optional, Union, cast
import cv2
@ -24,7 +23,7 @@ from unstructured_pytesseract import Output
from unstructured.documents.elements import ElementType
from unstructured.logger import logger
from unstructured.partition.pdf_image.pdf_image_utils import valid_text
from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes, valid_text
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import (
IMAGE_COLOR_DEPTH,
@ -363,21 +362,6 @@ def get_layout_elements_from_ocr(
return layout_elements
def pad_element_bboxes(
    element: "LayoutElement",
    padding: Union[int, float],
) -> "LayoutElement":
    """Return a copy of `element` whose bounding box is padded on every side.

    `padding` is subtracted from `x1`/`y1` and added to `x2`/`y2`; a negative
    value therefore shrinks the box instead of growing it. The element passed
    in is not modified: all changes are applied to a deep copy.
    """
    padded = deepcopy(element)
    box = padded.bbox
    box.x1, box.x2 = box.x1 - padding, box.x2 + padding
    box.y1, box.y2 = box.y1 - padding, box.y2 + padding
    return padded
def zoom_image(image: PILImage, zoom: float = 1) -> PILImage:
"""scale an image based on the zoom factor using cv2; the scaled image is post processed by
dilation then erosion to improve edge sharpness for OCR tasks"""

View File

@ -1,9 +1,10 @@
import base64
import os
import tempfile
from copy import deepcopy
from io import BytesIO
from pathlib import PurePath
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
import cv2
import numpy as np
@ -13,9 +14,11 @@ from PIL import Image
from unstructured.documents.elements import ElementType
from unstructured.logger import logger
from unstructured.partition.common import convert_to_bytes
from unstructured.partition.utils.config import env_config
if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured.documents.elements import Element
@ -75,6 +78,38 @@ def convert_pdf_to_image(
return images
def pad_element_bboxes(
    element: "LayoutElement",
    padding: Union[int, float],
) -> "LayoutElement":
    """Return a copy of `element` whose bounding box is padded on every side.

    `padding` is subtracted from `x1`/`y1` and added to `x2`/`y2`; a negative
    value therefore shrinks the box instead of growing it. The element passed
    in is not modified: all changes are applied to a deep copy.
    """
    padded = deepcopy(element)
    box = padded.bbox
    box.x1, box.x2 = box.x1 - padding, box.x2 + padding
    box.y1, box.y2 = box.y1 - padding, box.y2 + padding
    return padded
def pad_bbox(
    bbox: Tuple[float, float, float, float],
    padding: Tuple[Union[int, float], Union[int, float]],
) -> Tuple[float, float, float, float]:
    """Pad a bounding box by separate horizontal and vertical amounts.

    Args:
        bbox: Box coordinates as ``(x1, y1, x2, y2)``.
        padding: ``(horizontal, vertical)`` padding. The horizontal value is
            subtracted from ``x1`` and added to ``x2``; the vertical value is
            subtracted from ``y1`` and added to ``y2``.

    Returns:
        The padded coordinates as ``(x1, y1, x2, y2)``.
    """
    x_start, y_start, x_end, y_end = bbox
    pad_x, pad_y = padding
    return x_start - pad_x, y_start - pad_y, x_end + pad_x, y_end + pad_y
def save_elements(
elements: List["Element"],
element_category_to_save: str,
@ -131,6 +166,11 @@ def save_elements(
points = coordinates.points
x1, y1 = points[0]
x2, y2 = points[2]
h_padding = env_config.EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD
v_padding = env_config.EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD
padded_bbox = cast(
Tuple[int, int, int, int], pad_bbox((x1, y1, x2, y2), (h_padding, v_padding))
)
page_number = el.metadata.page_number
figure_number += 1
@ -142,7 +182,7 @@ def save_elements(
)
image_path = image_paths[page_number - 1]
image = Image.open(image_path)
cropped_image = image.crop((x1, y1, x2, y2))
cropped_image = image.crop(padded_bbox)
if extract_image_block_to_payload:
buffered = BytesIO()
cropped_image.save(buffered, format="JPEG")

View File

@ -79,5 +79,19 @@ class ENVConfig:
"""
return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)
@property
def EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD(self) -> int:
    """Horizontal crop padding, measured in pixels.

    Extra image content to include to the left and right of an identified
    element (`Image`, `Table`) region when its image block is extracted.
    Looked up under the `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` key with a
    fallback of 0.
    """
    horizontal_pad = self._get_int("EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", 0)
    return horizontal_pad
@property
def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int:
    """Vertical crop padding, measured in pixels.

    Extra image content to include above and below an identified element
    (`Image`, `Table`) region when its image block is extracted. Looked up
    under the `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD` key with a fallback of 0.
    """
    vertical_pad = self._get_int("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0)
    return vertical_pad
env_config = ENVConfig()