Extract coordinates from PDFs and images when using OCR only strategy (#1163)

### Summary

Closes #983

Creates new function `add_pytesseract_bbox_to_elements`

Fixes typos in docstrings

### Testing

```
from unstructured.partition.image import partition_image
from PIL import Image, ImageDraw

png_filename="example-docs/english-and-korean.png"
png_elements = partition_image(filename=png_filename, strategy="ocr_only")
png_image = Image.open(png_filename)
draw = ImageDraw.Draw(png_image)
draw.polygon(png_elements[0].metadata.coordinates.points, outline="red", width=2)
draw.polygon(png_elements[1].metadata.coordinates.points, outline="red", width=2)
draw.polygon(png_elements[2].metadata.coordinates.points, outline="red", width=2)
output = "example-docs/english-and-korean-box.png"
png_image.save(output)
png_image.close()
```
2025-11-03 11:34:07 +00:00 · 2023-08-25 00:32:12 -05:00 · 2023-08-25 00:32:12 -05:00 · 5872fa23c3
commit 5872fa23c3
parent c578b85699
5 changed files with 121 additions and 9 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,8 @@

 ### Features

+* Extract coordinates from PDFs and images when using OCR only strategy and add to metadata
+
 ### Fixes

 * Update `partition_html` to respect the order of `<pre>` tags.
--- a/test_unstructured/partition/pdf-image/test_image.py
+++ b/test_unstructured/partition/pdf-image/test_image.py
@ -7,7 +7,6 @@ from PIL import Image
 from pytesseract import TesseractError
 from unstructured_inference.inference import layout

-from unstructured.documents.elements import Title
 from unstructured.partition import image, pdf

 DIRECTORY = pathlib.Path(__file__).parent.resolve()
@ -194,7 +193,7 @@ def test_partition_image_with_ocr_detects_korean():
        strategy="ocr_only",
    )

-    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[0].text == "RULES AND INSTRUCTIONS"
    assert elements[3].text.replace(" ", "").startswith("안녕하세요")


@ -207,7 +206,7 @@ def test_partition_image_with_ocr_detects_korean_from_file():
            strategy="ocr_only",
        )

-    assert elements[0] == Title("RULES AND INSTRUCTIONS")
+    assert elements[0].text == "RULES AND INSTRUCTIONS"
    assert elements[3].text.replace(" ", "").startswith("안녕하세요")


@ -378,3 +377,17 @@ def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_met
        )

    assert elements[0].metadata.last_modified == expected_last_modification_date
+
+
+def test_partition_image_with_ocr_has_coordinates_from_file(
+    mocker,
+    filename="example-docs/english-and-korean.png",
+):
+    """The first element OCR'd from the example image carries the expected bounding box."""
+    mocker.patch(
+        "unstructured.partition.pdf.get_last_modified_date",
+        return_value="2029-07-05T09:24:28",
+    )
+    elements = image.partition_image(filename=filename, strategy="ocr_only")
+    first_points = elements[0].metadata.coordinates.points
+    # Truncate to ints before comparing so fractional parts of the converted
+    # coordinates do not affect the check.
+    truncated_points = [(int(x), int(y)) for x, y in first_points]
+    expected_points = [(14, 36), (14, 16), (381, 16), (381, 36)]
+    assert truncated_points == expected_points
--- a/test_unstructured/partition/pdf-image/test_pdf.py
+++ b/test_unstructured/partition/pdf-image/test_pdf.py
@ -768,3 +768,31 @@ def test_partition_pdf_from_file_with_hi_res_strategy_custom_metadata_date(
        )

    assert elements[0].metadata.last_modified == expected_last_modification_date
+
+
+def test_partition_pdf_with_ocr_has_coordinates_from_filename(
+    filename="example-docs/chevron-page.pdf",
+):
+    """Partitioning the example PDF by filename attaches the expected box to the first element."""
+    expected_points = [
+        (657.0, 2144.0),
+        (657.0, 2106.0),
+        (1043.0, 2106.0),
+        (1043.0, 2144.0),
+    ]
+    elements = pdf.partition_pdf(filename=filename, strategy="ocr_only")
+    first_points = elements[0].metadata.coordinates.points
+    assert first_points == expected_points
+
+
+def test_partition_pdf_with_ocr_has_coordinates_from_file(
+    filename="example-docs/chevron-page.pdf",
+):
+    """Partitioning the example PDF from an open binary file attaches the expected box."""
+    expected_points = [
+        (657.0, 2144.0),
+        (657.0, 2106.0),
+        (1043.0, 2106.0),
+        (1043.0, 2144.0),
+    ]
+    with open(filename, "rb") as pdf_file:
+        elements = pdf.partition_pdf(file=pdf_file, strategy="ocr_only")
+    first_points = elements[0].metadata.coordinates.points
+    assert first_points == expected_points
--- a/unstructured/documents/coordinates.py
+++ b/unstructured/documents/coordinates.py
@ -80,15 +80,15 @@ class RelativeCoordinateSystem(CoordinateSystem):


 class PixelSpace(CoordinateSystem):
-    """Coordinate system representing a pixel space, such as an image. The origin is at the bottom
-    right."""
+    """Coordinate system representing a pixel space, such as an image. The origin is at the top
+    left."""

    orientation = Orientation.SCREEN


 class PointSpace(CoordinateSystem):
-    """Coordinate system representing a point space, such as a pdf. The origin is at the top
-    right."""
+    """Coordinate system representing a point space, such as a pdf. The origin is at the bottom
+    left."""

    orientation = Orientation.CARTESIAN

--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@ -11,7 +11,7 @@ from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename

 from unstructured.cleaners.core import clean_extra_whitespace
-from unstructured.documents.coordinates import PixelSpace
+from unstructured.documents.coordinates import PixelSpace, PointSpace
 from unstructured.documents.elements import (
    CoordinatesMetadata,
    Element,
@ -460,6 +460,66 @@ def convert_pdf_to_images(
            yield image


+def add_pytesseract_bbox_to_elements(elements, bboxes, width, height):
+    """
+    Get the bounding box of each element and add it to element.metadata.coordinates
+
+    Box lines are consumed in order: each element claims as many lines as it has
+    non-whitespace characters, and its bounding box is the union of those character boxes.
+    An element that ends up with no box lines keeps whatever coordinates it already had.
+
+    Args:
+        elements: elements containing text detected by pytesseract.image_to_string.
+        bboxes (str): The return value of pytesseract.image_to_boxes.
+        width: Width given to the PointSpace and PixelSpace used for the conversion
+            (callers pass the width of the OCR'd image).
+        height: Height given to the PointSpace and PixelSpace used for the conversion
+            (callers pass the height of the OCR'd image).
+
+    Returns:
+        The `elements` argument; its items are updated in place.
+    """
+    # (NOTE) jennings: This function was written with pytesseract in mind, but
+    # paddle returns similar values via `ocr.ocr(img)`.
+    # See more at issue #1176: https://github.com/Unstructured-IO/unstructured/issues/1176
+    point_space = PointSpace(
+        width=width,
+        height=height,
+    )
+    pixel_space = PixelSpace(
+        width=width,
+        height=height,
+    )
+
+    # Ignore blank lines so that empty OCR output yields no boxes at all instead of a
+    # single empty entry that cannot be unpacked below.
+    boxes = [line for line in bboxes.splitlines() if line.strip()]
+    i = 0
+    for element in elements:
+        # Every box line is split on whitespace into exactly six fields below, so
+        # presumably no whitespace character can own a box line. Exclude all whitespace
+        # (not only spaces) to keep the running index aligned with the box lines.
+        char_count = len("".join(element.text.split()))
+        element_boxes = boxes[i : i + char_count]  # noqa
+        i += char_count
+
+        if not element_boxes:
+            # Empty text or box lines exhausted: skip rather than storing coordinates
+            # built from the infinite placeholder extents.
+            continue
+
+        min_x = float("inf")
+        min_y = float("inf")
+        max_x = 0
+        max_y = 0
+        for box in element_boxes:
+            _, x1, y1, x2, y2, _ = box.split()
+            x1, y1, x2, y2 = map(int, [x1, y1, x2, y2])
+
+            min_x = min(min_x, x1)
+            min_y = min(min_y, y1)
+            max_x = max(max_x, x2)
+            max_y = max(max_y, y2)
+
+        points = ((min_x, min_y), (min_x, max_y), (max_x, max_y), (max_x, min_y))
+        # The corners are handed to point_space and re-expressed in pixel_space, which is
+        # the system recorded alongside them in the metadata.
+        converted_points = [
+            tuple(point_space.convert_coordinates_to_new_system(pixel_space, x, y))
+            for x, y in points
+        ]
+
+        element.metadata.coordinates = CoordinatesMetadata(
+            points=converted_points,
+            system=pixel_space,
+        )
+
+    return elements
+
+
@requires_dependencies("pytesseract")
 def _partition_pdf_or_image_with_ocr(
    filename: str = "",
@ -471,7 +531,7 @@ def _partition_pdf_or_image_with_ocr(
    min_partition: Optional[int] = 0,
    metadata_last_modified: Optional[str] = None,
 ):
-    """Partitions and image or PDF using Tesseract OCR. For PDFs, each page is converted
+    """Partitions an image or PDF using Tesseract OCR. For PDFs, each page is converted
    to an image prior to processing."""
    import pytesseract

@ -479,14 +539,19 @@ def _partition_pdf_or_image_with_ocr(
        if file is not None:
            image = PIL.Image.open(file)
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
+            bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
        else:
+            image = PIL.Image.open(filename)
            text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
+            bboxes = pytesseract.image_to_boxes(filename, config=f"-l '{ocr_languages}'")
        elements = partition_text(
            text=text,
            max_partition=max_partition,
            min_partition=min_partition,
            metadata_last_modified=metadata_last_modified,
        )
+        width, height = image.size
+        add_pytesseract_bbox_to_elements(elements, bboxes, width, height)

    else:
        elements = []
@ -499,6 +564,8 @@ def _partition_pdf_or_image_with_ocr(
                last_modified=metadata_last_modified,
            )
            text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
+            bboxes = pytesseract.image_to_boxes(image, config=f"-l '{ocr_languages}'")
+            width, height = image.size

            _elements = partition_text(
                text=text,
@ -509,6 +576,8 @@ def _partition_pdf_or_image_with_ocr(
                element.metadata = metadata
                elements.append(element)

+            add_pytesseract_bbox_to_elements(elements, bboxes, width, height)
+
            if include_page_breaks:
                elements.append(PageBreak(text=""))
    return elements