From 3fe480799ac577d0b15fea4d33d6409258c4e136 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 9 Nov 2023 10:29:55 -0800 Subject: [PATCH] Fix: missing characters at the beginning of sentences in table ingest output after the table OCR refactor (#1961) Closes #1875. ### Summary - add functionality to do a second OCR on cropped table images - use `IMAGE_CROP_PAD` env for `individual_blocks` mode (minimal sketches of both changes appear after the patch) ### Testing The test function [`test_partition_pdf_hi_res_ocr_mode_with_table_extraction()`](https://github.com/Unstructured-IO/unstructured/blob/main/test_unstructured/partition/pdf_image/test_pdf.py#L425) in `test_pdf.py` should pass. ### NOTE: I experimented with values for the scaling ENVs in the following PRs, but found that changing those values affects the entire page OCR output (an OCR regression), so I switched to doing a second OCR for tables. - https://github.com/Unstructured-IO/unstructured/pull/1998/files - https://github.com/Unstructured-IO/unstructured/pull/2004/files - https://github.com/Unstructured-IO/unstructured/pull/2016/files - https://github.com/Unstructured-IO/unstructured/pull/2029/files --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub --- CHANGELOG.md | 3 +- .../partition/pdf_image/test_image.py | 3 +- .../partition/pdf_image/test_ocr.py | 43 +++++----- .../partition/pdf_image/test_pdf.py | 4 +- .../layout-parser-paper-with-table.jpg.json | 2 +- .../layout-parser-paper.pdf.json | 4 +- unstructured/__version__.py | 2 +- unstructured/partition/ocr.py | 86 +++++++------------ unstructured/partition/utils/config.py | 9 +- 9 files changed, 63 insertions(+), 93 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4014131f4..da504fa79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.30-dev4 +## 0.10.30-dev5 ### Enhancements @@ -7,6 +7,7 @@ ### Features +* **Add functionality to do a second OCR on cropped table images.** Changes to the values for scaling ENVs affect the entire page OCR output (an OCR regression), so we now do a second OCR for tables. * **Adds ability to pass timeout for a request when partitioning via a `url`.** `partition` now accepts a new optional parameter `request_timeout` which if set will prevent any `requests.get` from hanging indefinitely and instead will raise a timeout error. This is useful when partitioning a url that may be slow to respond or may not respond at all. 
### Fixes diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index 46f528bac..86a0a7a77 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -515,9 +515,8 @@ def test_partition_image_uses_model_name(): def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element): filename = "example-docs/layout-parser-paper-fast.jpg" elements = image.partition_image(filename=filename, ocr_mode=ocr_mode, strategy="hi_res") - first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" # Note(yuming): idx_title_element is different based on xy-cut and ocr mode - assert elements[idx_title_element].text == first_line + assert elements[idx_title_element].category == ElementType.TITLE def test_partition_image_hi_res_invalid_ocr_mode(): diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index cc2637fb7..4d94cd610 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import numpy as np import pandas as pd import pytest @@ -437,7 +439,7 @@ def table_element(): @pytest.fixture() -def ocr_layout(): +def mock_ocr_layout(): ocr_regions = [ TextRegion.from_coords(x1=15, y1=25, x2=35, y2=45, text="Token1"), TextRegion.from_coords(x1=40, y1=30, x2=45, y2=50, text="Token2"), @@ -445,23 +447,24 @@ def ocr_layout(): return ocr_regions -def test_get_table_tokens_per_element(table_element, ocr_layout): - table_tokens = ocr.get_table_tokens_per_element(table_element, ocr_layout) - expected_tokens = [ - { - "bbox": [5, 5, 25, 25], - "text": "Token1", - "span_num": 0, - "line_num": 0, - "block_num": 0, - }, - { - "bbox": [30, 10, 35, 30], - "text": "Token2", - "span_num": 1, - "line_num": 0, - "block_num": 0, - }, - ] +def test_get_table_tokens(mock_ocr_layout): + with patch.object(ocr, "get_ocr_layout_from_image", return_value=mock_ocr_layout): + table_tokens = ocr.get_table_tokens(image=None) + expected_tokens = [ + { + "bbox": [15, 25, 35, 45], + "text": "Token1", + "span_num": 0, + "line_num": 0, + "block_num": 0, + }, + { + "bbox": [40, 30, 45, 50], + "text": "Token2", + "span_num": 1, + "line_num": 0, + "block_num": 0, + }, + ] - assert table_tokens == expected_tokens + assert table_tokens == expected_tokens diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 7b5babda3..8273986c2 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -434,8 +434,8 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode): assert len(table) == 2 assert "
" in table[0] assert "Layouts of history Japanese documents" in table[0] - # FIXME(yuming): comment this out since there are some table regression issue - # assert "Layouts of scanned modern magazines and scientific reports" in table[0] + assert "Layouts of scanned modern magazines and scientific report" in table[0] + assert "Layouts of scanned US newspapers from the 20th century" in table[0] def test_partition_pdf_with_copy_protection(): diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index ededa714b..39d88751a 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -47,7 +47,7 @@ }, "filetype": "image/jpeg", "page_number": 1, - "text_as_html": "
Dataset| Base Model\"Large Model| Notes
PubLayNet [38]P/MMLayouts of modern scientific documents
PRImA [3)M-Layouts of scanned modern magazines and scientific reports
Newspaper [17]P-Layouts of scanned US newspapers from the 20th century
‘TableBank (18)PPTable region on modern scientific and business document
HJDataset (31)| F/M-Layouts of history Japanese documents
" + "text_as_html": "
Dataset| Base Model!|Large Model| Notes
PubLayNet [33]P/MMLayouts of modern scientific documents
PRImA [3]MLayouts of scanned modern magazines and scientific reports
Newspaper [17]PLayouts of scanned US newspapers from the 20th century
TableBank [18]PTable region on modern scientific and business document
HIDataset [31]P/MLayouts of history Japanese documents
" }, "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents" }, diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 3bec9e860..1851b878b 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -659,7 +659,7 @@ }, "filetype": "application/pdf", "page_number": 5, - "text_as_html": "
Dataset| Base Model'|Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMnned modern magazines and scientific reports
NewspapeiFcanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
" + "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMLayouts of scanned modern magazines and scientific report
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
" }, "text": "Base Model1 Large Model Notes Dataset PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents" }, @@ -1085,7 +1085,7 @@ }, "filetype": "application/pdf", "page_number": 8, - "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio ion in x and y di
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
; block1. intersect (block2)Return the intersection region of block and block2. . . . Coordinate type to be determined based on the inputs.
; block1.union(block2)Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
block1.relative_to(block2)Convert the absolute coordinates of block to ' ' relative coordinates to block2
. block1.condition_on(block2)Calculate the absolute coordinates of block1 given . the canvas block2’s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
" + "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
" }, "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratio in x and y direction block.scale(fx, fy) Move the current block with the shift distances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.intersect(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block1.condition on(block2) Obtain the image segments in the block region block.crop image(image)" }, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce3efec6e..d52d8591b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.30-dev4" # pragma: no cover +__version__ = "0.10.30-dev5" # pragma: no cover diff --git a/unstructured/partition/ocr.py b/unstructured/partition/ocr.py index 60d12b1b7..eccf48050 100644 --- a/unstructured/partition/ocr.py +++ b/unstructured/partition/ocr.py @@ -189,7 +189,6 @@ def supplement_page_layout_with_ocr( """ ocr_agent = get_ocr_agent() - ocr_layout = None if ocr_mode == OCRMode.FULL_PAGE.value: ocr_layout = get_ocr_layout_from_image( image, @@ -203,7 +202,8 @@ def supplement_page_layout_with_ocr( elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: for element in page_layout.elements: if element.text == "": - padded_element = pad_element_bboxes(element, padding=12) + padding = env_config.IMAGE_CROP_PAD + padded_element = pad_element_bboxes(element, padding=padding) cropped_image = image.crop( ( padded_element.bbox.x1, @@ -229,18 +229,12 @@ def supplement_page_layout_with_ocr( # Note(yuming): use the OCR data from entire page OCR for table extraction if infer_table_structure: table_agent = init_table_agent() - if ocr_layout is None: - # Note(yuming): ocr_layout is None for individual_blocks ocr_mode - ocr_layout = get_ocr_layout_from_image( - image, - ocr_languages=ocr_languages, - ocr_agent=ocr_agent, - ) page_layout.elements[:] = supplement_element_with_table_extraction( elements=cast(List[LayoutElement], page_layout.elements), - ocr_layout=ocr_layout, image=image, table_agent=table_agent, + ocr_languages=ocr_languages, + ocr_agent=ocr_agent, ) return page_layout @@ -248,9 +242,10 @@ def supplement_page_layout_with_ocr( def supplement_element_with_table_extraction( elements: List[LayoutElement], - ocr_layout: List[TextRegion], image: PILImage, table_agent: "UnstructuredTableTransformerModel", + ocr_languages: str = "eng", + ocr_agent: str = OCR_AGENT_TESSERACT, ) -> List[LayoutElement]: """Supplement the existing layout with table extraction. 
Any Table elements that are extracted will have a metadata field "text_as_html" where @@ -268,59 +263,38 @@ def supplement_element_with_table_extraction( padded_element.bbox.y2, ), ) - table_tokens = get_table_tokens_per_element( - padded_element, - ocr_layout, + table_tokens = get_table_tokens( + image=cropped_image, ocr_languages=ocr_languages, ocr_agent=ocr_agent ) element.text_as_html = table_agent.predict(cropped_image, ocr_tokens=table_tokens) return elements -def get_table_tokens_per_element( - table_element: LayoutElement, - ocr_layout: List[TextRegion], +def get_table_tokens( + image: PILImage, + ocr_languages: str = "eng", + ocr_agent: str = OCR_AGENT_TESSERACT, ) -> List[Dict]: - """ - Extract and prepare table tokens within the specified table element - based on the OCR layout of an entire image. + """Get OCR tokens from either paddleocr or tesseract""" - Parameters: - - table_element (LayoutElement): The table element for which table tokens - should be extracted. It typically represents the bounding box of the table. - - ocr_layout (List[TextRegion]): A list of TextRegion objects representing - the OCR layout of the entire image. - - Returns: - - List[Dict]: A list of dictionaries, each containing information about a table - token within the specified table element. Each dictionary includes the - following fields: - - 'bbox': A list of four coordinates [x1, y1, x2, y2] - relative to the table element's bounding box. - - 'text': The text content of the table token. - - 'span_num': (Optional) The span number of the table token. - - 'line_num': (Optional) The line number of the table token. - - 'block_num': (Optional) The block number of the table token. - """ - # TODO(yuming): update table_tokens from List[Dict] to List[TABLE_TOKEN] - # where TABLE_TOKEN will be a data class defined in unstructured-inference + ocr_layout = get_ocr_layout_from_image( + image, + ocr_languages=ocr_languages, + ocr_agent=ocr_agent, + ) table_tokens = [] for ocr_region in ocr_layout: - if ocr_region.bbox.is_in( - table_element.bbox, - error_margin=env_config.TABLE_TOKEN_ERROR_MARGIN, - ): - table_tokens.append( - { - "bbox": [ - # token bound box is relative to table element - ocr_region.bbox.x1 - table_element.bbox.x1, - ocr_region.bbox.y1 - table_element.bbox.y1, - ocr_region.bbox.x2 - table_element.bbox.x1, - ocr_region.bbox.y2 - table_element.bbox.y1, - ], - "text": ocr_region.text, - }, - ) + table_tokens.append( + { + "bbox": [ + ocr_region.bbox.x1, + ocr_region.bbox.y1, + ocr_region.bbox.x2, + ocr_region.bbox.y2, + ], + "text": ocr_region.text, + } + ) # 'table_tokens' is a list of tokens # Need to be in a relative reading order @@ -496,7 +470,7 @@ def get_ocr_layout_tesseract( text_height < env_config.TESSERACT_MIN_TEXT_HEIGHT or text_height > env_config.TESSERACT_MAX_TEXT_HEIGHT ): - # rounding avoids unnecessary precision and potential numerical issues assocaited + # rounding avoids unnecessary precision and potential numerical issues associated # with numbers very close to 1 inside cv2 image processing zoom = np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1) ocr_df = unstructured_pytesseract.image_to_data( diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index c8738d7da..d61555d7e 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -72,16 +72,9 @@ class ENVConfig: """optimum text height for tesseract OCR""" return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20) - @property - def 
TABLE_TOKEN_ERROR_MARGIN(self) -> float: - """error margin when comparing if a ocr region is within the table element when perparing - table tokens - """ - return self._get_float("TABLE_TOKEN_ERROR_MARGIN", 0.0) - @property def OCR_AGENT(self) -> str: - """error margin when comparing if a ocr region is within the table element when perparing - table tokens - """ + """The OCR agent to use.""" return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)
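
For readers tracing the `ocr.py` hunk above, here is a minimal, self-contained sketch of the idea behind the fix. The removed code filtered the full-page OCR layout down to regions inside the detected table box (using `TABLE_TOKEN_ERROR_MARGIN`) and shifted their coordinates into the table frame, which could drop or clip tokens near the table edge, the likely source of the missing leading characters. The new code crops the padded table image and runs a second OCR pass on the crop alone. This is not code from the patch: `Box`, `run_ocr`, and `table_tokens_from_crop` are hypothetical stand-ins for `TextRegion`, `get_ocr_layout_from_image`, and `get_table_tokens`, using plain `pytesseract` and a pad of 12 (the value the old `individual_blocks` code hard-coded).

```python
from dataclasses import dataclass
from typing import Dict, List

import pytesseract
from PIL import Image


@dataclass
class Box:
    """Hypothetical stand-in for unstructured's TextRegion bounding box."""

    x1: int
    y1: int
    x2: int
    y2: int
    text: str


def run_ocr(image: Image.Image) -> List[Box]:
    """Plain-pytesseract stand-in for get_ocr_layout_from_image()."""
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    return [
        Box(x, y, x + w, y + h, text)
        for x, y, w, h, text in zip(
            data["left"], data["top"], data["width"], data["height"], data["text"]
        )
        if text.strip()
    ]


def table_tokens_from_crop(page: Image.Image, table: Box, pad: int = 12) -> List[Dict]:
    """New approach: crop the padded table region, then OCR only the crop."""
    crop = page.crop((table.x1 - pad, table.y1 - pad, table.x2 + pad, table.y2 + pad))
    # Token bboxes come back relative to the cropped table image, so no
    # coordinate shifting and no error-margin filtering against the table
    # box is needed; tokens at the table edge are no longer dropped.
    return [
        {
            "bbox": [region.x1, region.y1, region.x2, region.y2],
            "text": region.text,
            "span_num": i,
            "line_num": 0,
            "block_num": 0,
        }
        for i, region in enumerate(run_ocr(crop))
    ]
```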
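
And a usage sketch for the second Summary item: in `individual_blocks` mode, the per-element crop pad is now read from the `IMAGE_CROP_PAD` environment variable instead of being hard-coded to 12. The snippet below is an illustration under stated assumptions, not patch documentation: the image path is the fixture from the tests above, and passing `infer_table_structure=True` to `partition_image` is inferred from those tests.

```python
import os

# Assumption: "12" mirrors the previously hard-coded pad; larger values pull
# in more surrounding pixels before each per-block OCR crop. Set it before
# partitioning so env_config picks it up (hence the late import below).
os.environ["IMAGE_CROP_PAD"] = "12"

from unstructured.partition.image import partition_image

elements = partition_image(
    filename="example-docs/layout-parser-paper-with-table.jpg",  # test fixture
    strategy="hi_res",
    ocr_mode="individual_blocks",  # the mode that now honors IMAGE_CROP_PAD
    infer_table_structure=True,    # triggers the second OCR on table crops
)

# Tables now carry HTML produced from the second, crop-level OCR pass.
tables = [el for el in elements if el.category == "Table"]
print(tables[0].metadata.text_as_html if tables else "no tables found")
```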