Fix: missing characters at the beginning of sentences on table ingest output after table OCR refactor (#1961)

Closes #1875.

### Summary
- add functionality to do a second OCR on cropped table images
- use `IMAGE_CROP_PAD` env for `individual_blocks` mode
### Testing
The test function
[`test_partition_pdf_hi_res_ocr_mode_with_table_extraction()`](https://github.com/Unstructured-IO/unstructured/blob/main/test_unstructured/partition/pdf_image/test_pdf.py#L425)
in `test_pdf.py` should pass.

### Note
I experimented with values for the scaling ENVs on the following
PRs, but found that changing the scaling ENV values affects the
entire page's OCR output (an OCR regression), so I switched to doing a
second OCR pass for tables instead.
- https://github.com/Unstructured-IO/unstructured/pull/1998/files 
- https://github.com/Unstructured-IO/unstructured/pull/2004/files
- https://github.com/Unstructured-IO/unstructured/pull/2016/files
- https://github.com/Unstructured-IO/unstructured/pull/2029/files

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
This commit is contained in:
Christine Straub 2023-11-09 10:29:55 -08:00 committed by GitHub
parent bb58c1bb0b
commit 3fe480799a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 63 additions and 93 deletions

View File

@ -1,4 +1,4 @@
## 0.10.30-dev4
## 0.10.30-dev5
### Enhancements
@ -7,6 +7,7 @@
### Features
* **Add functionality to do a second OCR on cropped table images.** Changes to the values for scaling ENVs affect entire page OCR output(OCR regression) so we now do a second OCR for tables.
* **Adds ability to pass timeout for a request when partitioning via a `url`.** `partition` now accepts a new optional parameter `request_timeout` which if set will prevent any `requests.get` from hanging indefinitely and instead will raise a timeout error. This is useful when partitioning a url that may be slow to respond or may not respond at all.
### Fixes

View File

@ -515,9 +515,8 @@ def test_partition_image_uses_model_name():
def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
filename = "example-docs/layout-parser-paper-fast.jpg"
elements = image.partition_image(filename=filename, ocr_mode=ocr_mode, strategy="hi_res")
first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
# Note(yuming): idx_title_element is different based on xy-cut and ocr mode
assert elements[idx_title_element].text == first_line
assert elements[idx_title_element].category == ElementType.TITLE
def test_partition_image_hi_res_invalid_ocr_mode():

View File

@ -1,3 +1,5 @@
from unittest.mock import patch
import numpy as np
import pandas as pd
import pytest
@ -437,7 +439,7 @@ def table_element():
@pytest.fixture()
def ocr_layout():
def mock_ocr_layout():
ocr_regions = [
TextRegion.from_coords(x1=15, y1=25, x2=35, y2=45, text="Token1"),
TextRegion.from_coords(x1=40, y1=30, x2=45, y2=50, text="Token2"),
@ -445,23 +447,24 @@ def ocr_layout():
return ocr_regions
def test_get_table_tokens_per_element(table_element, ocr_layout):
table_tokens = ocr.get_table_tokens_per_element(table_element, ocr_layout)
expected_tokens = [
{
"bbox": [5, 5, 25, 25],
"text": "Token1",
"span_num": 0,
"line_num": 0,
"block_num": 0,
},
{
"bbox": [30, 10, 35, 30],
"text": "Token2",
"span_num": 1,
"line_num": 0,
"block_num": 0,
},
]
def test_get_table_tokens(mock_ocr_layout):
with patch.object(ocr, "get_ocr_layout_from_image", return_value=mock_ocr_layout):
table_tokens = ocr.get_table_tokens(image=None)
expected_tokens = [
{
"bbox": [15, 25, 35, 45],
"text": "Token1",
"span_num": 0,
"line_num": 0,
"block_num": 0,
},
{
"bbox": [40, 30, 45, 50],
"text": "Token2",
"span_num": 1,
"line_num": 0,
"block_num": 0,
},
]
assert table_tokens == expected_tokens
assert table_tokens == expected_tokens

View File

@ -434,8 +434,8 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
assert len(table) == 2
assert "<table><thead><th>" in table[0]
assert "Layouts of history Japanese documents" in table[0]
# FIXME(yuming): comment this out since there are some table regression issue
# assert "Layouts of scanned modern magazines and scientific reports" in table[0]
assert "Layouts of scanned modern magazines and scientific report" in table[0]
assert "Layouts of scanned US newspapers from the 20th century" in table[0]
def test_partition_pdf_with_copy_protection():

View File

@ -47,7 +47,7 @@
},
"filetype": "image/jpeg",
"page_number": 1,
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model\"</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [38]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3)</td><td>M</td><td>-</td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td>-</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank (18)</td><td>P</td><td>P</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset (31)</td><td>| F/M</td><td>-</td><td>Layouts of history Japanese documents</td></tr></table>"
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></table>"
},
"text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents"
},

View File

@ -659,7 +659,7 @@
},
"filetype": "application/pdf",
"page_number": 5,
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>Notes</th></thead><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>nned modern magazines and scientific reports</td></tr><tr><td>Newspapei</td><td>F</td><td>canned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>"
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></thead><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>"
},
"text": "Base Model1 Large Model Notes Dataset PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents"
},
@ -1085,7 +1085,7 @@
},
"filetype": "application/pdf",
"page_number": 8,
"text_as_html": "<table><thead><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></thead><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio ion in x and y di</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>; block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of block and block2. . . . Coordinate type to be determined based on the inputs.</td></tr><tr><td>; block1.union(block2)</td><td></td><td></td><td>Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs.</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to ' ' relative coordinates to block2</td></tr><tr><td>. block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of block1 given . the canvas block2s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></table>"
"text_as_html": "<table><thead><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></thead><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></table>"
},
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratio in x and y direction block.scale(fx, fy) Move the current block with the shift distances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.intersect(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 given the canvas block2s absolute coordinates block1.condition on(block2) Obtain the image segments in the block region block.crop image(image)"
},

View File

@ -1 +1 @@
__version__ = "0.10.30-dev4" # pragma: no cover
__version__ = "0.10.30-dev5" # pragma: no cover

View File

@ -189,7 +189,6 @@ def supplement_page_layout_with_ocr(
"""
ocr_agent = get_ocr_agent()
ocr_layout = None
if ocr_mode == OCRMode.FULL_PAGE.value:
ocr_layout = get_ocr_layout_from_image(
image,
@ -203,7 +202,8 @@ def supplement_page_layout_with_ocr(
elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
for element in page_layout.elements:
if element.text == "":
padded_element = pad_element_bboxes(element, padding=12)
padding = env_config.IMAGE_CROP_PAD
padded_element = pad_element_bboxes(element, padding=padding)
cropped_image = image.crop(
(
padded_element.bbox.x1,
@ -229,18 +229,12 @@ def supplement_page_layout_with_ocr(
# Note(yuming): use the OCR data from entire page OCR for table extraction
if infer_table_structure:
table_agent = init_table_agent()
if ocr_layout is None:
# Note(yuming): ocr_layout is None for individual_blocks ocr_mode
ocr_layout = get_ocr_layout_from_image(
image,
ocr_languages=ocr_languages,
ocr_agent=ocr_agent,
)
page_layout.elements[:] = supplement_element_with_table_extraction(
elements=cast(List[LayoutElement], page_layout.elements),
ocr_layout=ocr_layout,
image=image,
table_agent=table_agent,
ocr_languages=ocr_languages,
ocr_agent=ocr_agent,
)
return page_layout
@ -248,9 +242,10 @@ def supplement_page_layout_with_ocr(
def supplement_element_with_table_extraction(
elements: List[LayoutElement],
ocr_layout: List[TextRegion],
image: PILImage,
table_agent: "UnstructuredTableTransformerModel",
ocr_languages: str = "eng",
ocr_agent: str = OCR_AGENT_TESSERACT,
) -> List[LayoutElement]:
"""Supplement the existing layout with table extraction. Any Table elements
that are extracted will have a metadata field "text_as_html" where
@ -268,59 +263,38 @@ def supplement_element_with_table_extraction(
padded_element.bbox.y2,
),
)
table_tokens = get_table_tokens_per_element(
padded_element,
ocr_layout,
table_tokens = get_table_tokens(
image=cropped_image, ocr_languages=ocr_languages, ocr_agent=ocr_agent
)
element.text_as_html = table_agent.predict(cropped_image, ocr_tokens=table_tokens)
return elements
def get_table_tokens_per_element(
table_element: LayoutElement,
ocr_layout: List[TextRegion],
def get_table_tokens(
image: PILImage,
ocr_languages: str = "eng",
ocr_agent: str = OCR_AGENT_TESSERACT,
) -> List[Dict]:
"""
Extract and prepare table tokens within the specified table element
based on the OCR layout of an entire image.
"""Get OCR tokens from either paddleocr or tesseract"""
Parameters:
- table_element (LayoutElement): The table element for which table tokens
should be extracted. It typically represents the bounding box of the table.
- ocr_layout (List[TextRegion]): A list of TextRegion objects representing
the OCR layout of the entire image.
Returns:
- List[Dict]: A list of dictionaries, each containing information about a table
token within the specified table element. Each dictionary includes the
following fields:
- 'bbox': A list of four coordinates [x1, y1, x2, y2]
relative to the table element's bounding box.
- 'text': The text content of the table token.
- 'span_num': (Optional) The span number of the table token.
- 'line_num': (Optional) The line number of the table token.
- 'block_num': (Optional) The block number of the table token.
"""
# TODO(yuming): update table_tokens from List[Dict] to List[TABLE_TOKEN]
# where TABLE_TOKEN will be a data class defined in unstructured-inference
ocr_layout = get_ocr_layout_from_image(
image,
ocr_languages=ocr_languages,
ocr_agent=ocr_agent,
)
table_tokens = []
for ocr_region in ocr_layout:
if ocr_region.bbox.is_in(
table_element.bbox,
error_margin=env_config.TABLE_TOKEN_ERROR_MARGIN,
):
table_tokens.append(
{
"bbox": [
# token bound box is relative to table element
ocr_region.bbox.x1 - table_element.bbox.x1,
ocr_region.bbox.y1 - table_element.bbox.y1,
ocr_region.bbox.x2 - table_element.bbox.x1,
ocr_region.bbox.y2 - table_element.bbox.y1,
],
"text": ocr_region.text,
},
)
table_tokens.append(
{
"bbox": [
ocr_region.bbox.x1,
ocr_region.bbox.y1,
ocr_region.bbox.x2,
ocr_region.bbox.y2,
],
"text": ocr_region.text,
}
)
# 'table_tokens' is a list of tokens
# Need to be in a relative reading order
@ -496,7 +470,7 @@ def get_ocr_layout_tesseract(
text_height < env_config.TESSERACT_MIN_TEXT_HEIGHT
or text_height > env_config.TESSERACT_MAX_TEXT_HEIGHT
):
# rounding avoids unnecessary precision and potential numerical issues assocaited
# rounding avoids unnecessary precision and potential numerical issues associated
# with numbers very close to 1 inside cv2 image processing
zoom = np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1)
ocr_df = unstructured_pytesseract.image_to_data(

View File

@ -72,16 +72,9 @@ class ENVConfig:
"""optimum text height for tesseract OCR"""
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
@property
def TABLE_TOKEN_ERROR_MARGIN(self) -> float:
"""error margin when comparing if a ocr region is within the table element when perparing
table tokens
"""
return self._get_float("TABLE_TOKEN_ERROR_MARGIN", 0.0)
@property
def OCR_AGENT(self) -> str:
"""error margin when comparing if a ocr region is within the table element when perparing
"""error margin when comparing if a ocr region is within the table element when preparing
table tokens
"""
return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)