mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-04 19:16:03 +00:00
enhancement: allow setting image block crop padding parameter (#2415)
Closes #2320. ### Summary In certain circumstances, adjusting the image block crop padding can improve image block extraction by preventing extracted image blocks from being clipped. ### Testing - PDF: [LM339-D_2-2.pdf](https://github.com/Unstructured-IO/unstructured/files/13968952/LM339-D_2-2.pdf) - Set two environment variables `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD` and `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD` (e.g. `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD = 40`, `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD = 20`), then run: ``` elements = partition_pdf( filename="LM339-D_2-2.pdf", extract_image_block_types=["image"], ) ```
This commit is contained in:
parent
1a305866d1
commit
7378a378f6
@ -1,8 +1,9 @@
|
||||
## 0.12.1-dev17
|
||||
## 0.12.1-dev18
|
||||
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Allow setting image block crop padding parameter** In certain circumstances, adjusting the image block crop padding can improve image block extraction by preventing extracted image blocks from being clipped.
|
||||
* **Add support for bitmap images in `partition_image`** Adds support for `.bmp` files in
|
||||
`partition`, `partition_image`, and `detect_filetype`.
|
||||
* **Keep all image elements when using "hi_res" strategy** Previously, `Image` elements with small chunks of text were ignored unless the image block extraction parameters (`extract_images_in_pdf` or `extract_image_block_types`) were specified. Now, all image elements are kept regardless of whether the image block extraction parameters are specified.
|
||||
|
||||
@ -634,7 +634,7 @@ The available strategies for PDFs are ``"auto"``, ``"hi_res"``, ``"ocr_only"``,
|
||||
|
||||
* The ``"fast"`` strategy will extract the text using ``pdfminer`` and process the raw text with ``partition_text``. If the PDF text is not extractable, ``partition_pdf`` will fall back to ``"ocr_only"``. We recommend using the ``"fast"`` strategy in most cases where the PDF has extractable text.
|
||||
|
||||
To extract images and elements as image blocks from a PDF, it is mandatory to set ``strategy="hi_res"`` when setting ``extract_images_in_pdf=True``. With this configuration, detected images are saved in a specified directory or encoded within the file. However, keep in mind that ``extract_images_in_pdf`` is being phased out in favor of ``extract_image_block_types``. This option allows you to specify types of images or elements, like "Image" or "Table". For integrating these images directly into web applications or APIs, ``extract_image_block_to_payload`` can be used to convert them into ``base64`` format, including details about the image type. Lastly, the ``extract_image_block_output_dir`` can be used to specify the filesystem path for saving the extracted images when not embedding them in payloads.
|
||||
To extract images and elements as image blocks from a PDF, it is mandatory to set ``strategy="hi_res"`` when setting ``extract_images_in_pdf=True``. With this configuration, detected images are saved in a specified directory or encoded within the file. However, keep in mind that ``extract_images_in_pdf`` is being phased out in favor of ``extract_image_block_types``. This option allows you to specify types of images or elements, like "Image" or "Table". If some extracted images appear clipped, you can adjust the crop padding by setting two environment variables, ``EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD`` and ``EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD`` (for example, ``EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD=20`` and ``EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD=10``). For integrating these images directly into web applications or APIs, ``extract_image_block_to_payload`` can be used to convert them into ``base64`` format, including details about the image type. Lastly, the ``extract_image_block_output_dir`` can be used to specify the filesystem path for saving the extracted images when not embedding them in payloads.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@ -132,3 +132,12 @@ def test_write_image_raises_error():
|
||||
)
|
||||
def test_valid_text(text, outcome):
|
||||
assert pdf_image_utils.valid_text(text) == outcome
|
||||
|
||||
|
||||
def test_pad_bbox():
    """`pad_bbox` widens the x pair by the first padding value and the y pair by the second."""
    original_box = (100, 100, 200, 200)
    horizontal_pad, vertical_pad = 10, 20

    padded_box = pdf_image_utils.pad_bbox(original_box, (horizontal_pad, vertical_pad))

    assert padded_box == (90, 80, 210, 220)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.12.1-dev17" # pragma: no cover
|
||||
__version__ = "0.12.1-dev18" # pragma: no cover
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
import os
|
||||
import tempfile
|
||||
from copy import deepcopy
|
||||
from typing import BinaryIO, Dict, List, Optional, Union, cast
|
||||
|
||||
import cv2
|
||||
@ -24,7 +23,7 @@ from unstructured_pytesseract import Output
|
||||
|
||||
from unstructured.documents.elements import ElementType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.pdf_image.pdf_image_utils import valid_text
|
||||
from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes, valid_text
|
||||
from unstructured.partition.utils.config import env_config
|
||||
from unstructured.partition.utils.constants import (
|
||||
IMAGE_COLOR_DEPTH,
|
||||
@ -363,21 +362,6 @@ def get_layout_elements_from_ocr(
|
||||
return layout_elements
|
||||
|
||||
|
||||
def pad_element_bboxes(
    element: "LayoutElement",
    padding: Union[int, float],
) -> "LayoutElement":
    """Return a deep copy of `element` with its bounding box grown by `padding` on every side.

    `x1` and `y1` are decreased by `padding`, `x2` and `y2` are increased by it, so a negative
    `padding` shrinks the box instead. The element passed in is not modified.
    """
    padded = deepcopy(element)
    # Same update order as before (x1, x2, y1, y2); subtracting is expressed as adding -padding.
    for coord, delta in (("x1", -padding), ("x2", padding), ("y1", -padding), ("y2", padding)):
        setattr(padded.bbox, coord, getattr(padded.bbox, coord) + delta)
    return padded
|
||||
|
||||
|
||||
def zoom_image(image: PILImage, zoom: float = 1) -> PILImage:
|
||||
"""scale an image based on the zoom factor using cv2; the scaled image is post processed by
|
||||
dilation then erosion to improve edge sharpness for OCR tasks"""
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
import base64
|
||||
import os
|
||||
import tempfile
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
from pathlib import PurePath
|
||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
|
||||
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
@ -13,9 +14,11 @@ from PIL import Image
|
||||
from unstructured.documents.elements import ElementType
|
||||
from unstructured.logger import logger
|
||||
from unstructured.partition.common import convert_to_bytes
|
||||
from unstructured.partition.utils.config import env_config
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unstructured_inference.inference.layout import DocumentLayout, PageLayout, TextRegion
|
||||
from unstructured_inference.inference.layoutelement import LayoutElement
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
|
||||
@ -75,6 +78,38 @@ def convert_pdf_to_image(
|
||||
return images
|
||||
|
||||
|
||||
def pad_element_bboxes(
    element: "LayoutElement",
    padding: Union[int, float],
) -> "LayoutElement":
    """Return a deep copy of `element` with its bounding box grown by `padding` on every side.

    `x1` and `y1` are decreased by `padding`, `x2` and `y2` are increased by it, so a negative
    `padding` shrinks the box instead. The element passed in is not modified.
    """
    padded = deepcopy(element)
    # Same update order as before (x1, x2, y1, y2); subtracting is expressed as adding -padding.
    for coord, delta in (("x1", -padding), ("x2", padding), ("y1", -padding), ("y2", padding)):
        setattr(padded.bbox, coord, getattr(padded.bbox, coord) + delta)

    return padded
|
||||
|
||||
|
||||
def pad_bbox(
    bbox: Tuple[float, float, float, float],
    padding: Tuple[Union[int, float], Union[int, float]],
) -> Tuple[float, float, float, float]:
    """Return `bbox` expanded by a horizontal and a vertical padding amount.

    `bbox` is unpacked as `(x1, y1, x2, y2)` and `padding` as `(horizontal, vertical)`. The
    horizontal amount is subtracted from `x1` and added to `x2`; the vertical amount is
    subtracted from `y1` and added to `y2`. Negative amounts therefore shrink the box.
    """
    first_x, first_y, second_x, second_y = bbox
    pad_x, pad_y = padding

    return first_x - pad_x, first_y - pad_y, second_x + pad_x, second_y + pad_y
|
||||
|
||||
|
||||
def save_elements(
|
||||
elements: List["Element"],
|
||||
element_category_to_save: str,
|
||||
@ -131,6 +166,11 @@ def save_elements(
|
||||
points = coordinates.points
|
||||
x1, y1 = points[0]
|
||||
x2, y2 = points[2]
|
||||
h_padding = env_config.EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD
|
||||
v_padding = env_config.EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD
|
||||
padded_bbox = cast(
|
||||
Tuple[int, int, int, int], pad_bbox((x1, y1, x2, y2), (h_padding, v_padding))
|
||||
)
|
||||
page_number = el.metadata.page_number
|
||||
|
||||
figure_number += 1
|
||||
@ -142,7 +182,7 @@ def save_elements(
|
||||
)
|
||||
image_path = image_paths[page_number - 1]
|
||||
image = Image.open(image_path)
|
||||
cropped_image = image.crop((x1, y1, x2, y2))
|
||||
cropped_image = image.crop(padded_bbox)
|
||||
if extract_image_block_to_payload:
|
||||
buffered = BytesIO()
|
||||
cropped_image.save(buffered, format="JPEG")
|
||||
|
||||
@ -79,5 +79,19 @@ class ENVConfig:
|
||||
"""
|
||||
return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)
|
||||
|
||||
@property
def EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD(self) -> int:
    """horizontal padding, measured in pixels, of extra image block content to include around
    an identified element (`Image`, `Table`) region

    Looked up through `_get_int` under the name `EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD`, with
    0 passed as the default.
    """
    setting_name, fallback = "EXTRACT_IMAGE_BLOCK_CROP_HORIZONTAL_PAD", 0
    return self._get_int(setting_name, fallback)
|
||||
|
||||
@property
def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int:
    """vertical padding, measured in pixels, of extra image block content to include around
    an identified element (`Image`, `Table`) region

    Looked up through `_get_int` under the name `EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD`, with
    0 passed as the default.
    """
    setting_name, fallback = "EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0
    return self._get_int(setting_name, fallback)
|
||||
|
||||
|
||||
env_config = ENVConfig()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user