Fix: missing characters at the beginning of sentences on table ingest output after table OCR refactor (#1961)

Closes #1875.

### Summary
- add functionality to do a second OCR on cropped table images
- use `IMAGE_CROP_PAD` env for `individual_blocks` mode
### Testing
The test function
[`test_partition_pdf_hi_res_ocr_mode_with_table_extraction()`](https://github.com/Unstructured-IO/unstructured/blob/main/test_unstructured/partition/pdf_image/test_pdf.py#L425)
in `test_pdf.py` should pass.

### Note
I experimented with values for the scaling ENVs on the following
PRs, but found that changing the scaling ENV values affects the
entire page's OCR output (an OCR regression), so I switched to doing a
second OCR pass for tables instead.
- https://github.com/Unstructured-IO/unstructured/pull/1998/files 
- https://github.com/Unstructured-IO/unstructured/pull/2004/files
- https://github.com/Unstructured-IO/unstructured/pull/2016/files
- https://github.com/Unstructured-IO/unstructured/pull/2029/files

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
This commit is contained in:
Christine Straub 2023-11-09 10:29:55 -08:00 committed by GitHub
parent bb58c1bb0b
commit 3fe480799a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 63 additions and 93 deletions

View File

@ -1,4 +1,4 @@
## 0.10.30-dev4
## 0.10.30-dev5
### Enhancements
@ -7,6 +7,7 @@
### Features
* **Add functionality to do a second OCR on cropped table images.** Changes to the values for scaling ENVs affect entire page OCR output(OCR regression) so we now do a second OCR for tables.
* **Adds ability to pass timeout for a request when partitioning via a `url`.** `partition` now accepts a new optional parameter `request_timeout` which if set will prevent any `requests.get` from hanging indefinitely and instead will raise a timeout error. This is useful when partitioning a url that may be slow to respond or may not respond at all.
### Fixes

View File

@ -515,9 +515,8 @@ def test_partition_image_uses_model_name():
def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
filename = "example-docs/layout-parser-paper-fast.jpg"
elements = image.partition_image(filename=filename, ocr_mode=ocr_mode, strategy="hi_res")
first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
# Note(yuming): idx_title_element is different based on xy-cut and ocr mode
assert elements[idx_title_element].text == first_line
assert elements[idx_title_element].category == ElementType.TITLE
def test_partition_image_hi_res_invalid_ocr_mode():

View File

@ -1,3 +1,5 @@
from unittest.mock import patch
import numpy as np
import pandas as pd
import pytest
@ -437,7 +439,7 @@ def table_element():
@pytest.fixture()
def ocr_layout():
def mock_ocr_layout():
ocr_regions = [
TextRegion.from_coords(x1=15, y1=25, x2=35, y2=45, text="Token1"),
TextRegion.from_coords(x1=40, y1=30, x2=45, y2=50, text="Token2"),
@ -445,23 +447,24 @@ def ocr_layout():
return ocr_regions
def test_get_table_tokens_per_element(table_element, ocr_layout):
table_tokens = ocr.get_table_tokens_per_element(table_element, ocr_layout)
expected_tokens = [
{
"bbox": [5, 5, 25, 25],
"text": "Token1",
"span_num": 0,
"line_num": 0,
"block_num": 0,
},
{
"bbox": [30, 10, 35, 30],
"text": "Token2",
"span_num": 1,
"line_num": 0,
"block_num": 0,
},
]
def test_get_table_tokens(mock_ocr_layout):
with patch.object(ocr, "get_ocr_layout_from_image", return_value=mock_ocr_layout):
table_tokens = ocr.get_table_tokens(image=None)
expected_tokens = [
{
"bbox": [15, 25, 35, 45],
"text": "Token1",
"span_num": 0,
"line_num": 0,
"block_num": 0,
},
{
"bbox": [40, 30, 45, 50],
"text": "Token2",
"span_num": 1,
"line_num": 0,
"block_num": 0,
},
]
assert table_tokens == expected_tokens
assert table_tokens == expected_tokens

View File

@ -434,8 +434,8 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
assert len(table) == 2
assert "<table><thead><th>" in table[0]
assert "Layouts of history Japanese documents" in table[0]
# FIXME(yuming): comment this out since there are some table regression issue
# assert "Layouts of scanned modern magazines and scientific reports" in table[0]
assert "Layouts of scanned modern magazines and scientific report" in table[0]
assert "Layouts of scanned US newspapers from the 20th century" in table[0]
def test_partition_pdf_with_copy_protection():

View File

@ -47,7 +47,7 @@
},
"filetype": "image/jpeg",
"page_number": 1,
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model\"</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [38]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3)</td><td>M</td><td>-</td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td>-</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank (18)</td><td>P</td><td>P</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset (31)</td><td>| F/M</td><td>-</td><td>Layouts of history Japanese documents</td></tr></table>"
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model!|</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [33]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td></td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td></td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank [18]</td><td>P</td><td></td><td>Table region on modern scientific and business document</td></tr><tr><td>HIDataset [31]</td><td>P/M</td><td></td><td>Layouts of history Japanese documents</td></tr></table>"
},
"text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents"
},

View File

@ -659,7 +659,7 @@
},
"filetype": "application/pdf",
"page_number": 5,
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>Notes</th></thead><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>nned modern magazines and scientific reports</td></tr><tr><td>Newspapei</td><td>F</td><td>canned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>"
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></thead><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>"
},
"text": "Base Model1 Large Model Notes Dataset PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents"
},
@ -1085,7 +1085,7 @@
},
"filetype": "application/pdf",
"page_number": 8,
"text_as_html": "<table><thead><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></thead><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio ion in x and y di</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>; block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of block and block2. . . . Coordinate type to be determined based on the inputs.</td></tr><tr><td>; block1.union(block2)</td><td></td><td></td><td>Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs.</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to ' ' relative coordinates to block2</td></tr><tr><td>. block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of block1 given . the canvas block2s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></table>"
"text_as_html": "<table><thead><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></thead><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></table>"
},
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratio in x and y direction block.scale(fx, fy) Move the current block with the shift distances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.intersect(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 given the canvas block2s absolute coordinates block1.condition on(block2) Obtain the image segments in the block region block.crop image(image)"
},

View File

@ -1 +1 @@
__version__ = "0.10.30-dev4" # pragma: no cover
__version__ = "0.10.30-dev5" # pragma: no cover

View File

@ -189,7 +189,6 @@ def supplement_page_layout_with_ocr(
"""
ocr_agent = get_ocr_agent()
ocr_layout = None
if ocr_mode == OCRMode.FULL_PAGE.value:
ocr_layout = get_ocr_layout_from_image(
image,
@ -203,7 +202,8 @@ def supplement_page_layout_with_ocr(
elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
for element in page_layout.elements:
if element.text == "":
padded_element = pad_element_bboxes(element, padding=12)
padding = env_config.IMAGE_CROP_PAD
padded_element = pad_element_bboxes(element, padding=padding)
cropped_image = image.crop(
(
padded_element.bbox.x1,
@ -229,18 +229,12 @@ def supplement_page_layout_with_ocr(
# Note(yuming): use the OCR data from entire page OCR for table extraction
if infer_table_structure:
table_agent = init_table_agent()
if ocr_layout is None:
# Note(yuming): ocr_layout is None for individual_blocks ocr_mode
ocr_layout = get_ocr_layout_from_image(
image,
ocr_languages=ocr_languages,
ocr_agent=ocr_agent,
)
page_layout.elements[:] = supplement_element_with_table_extraction(
elements=cast(List[LayoutElement], page_layout.elements),
ocr_layout=ocr_layout,
image=image,
table_agent=table_agent,
ocr_languages=ocr_languages,
ocr_agent=ocr_agent,
)
return page_layout
@ -248,9 +242,10 @@ def supplement_page_layout_with_ocr(
def supplement_element_with_table_extraction(
elements: List[LayoutElement],
ocr_layout: List[TextRegion],
image: PILImage,
table_agent: "UnstructuredTableTransformerModel",
ocr_languages: str = "eng",
ocr_agent: str = OCR_AGENT_TESSERACT,
) -> List[LayoutElement]:
"""Supplement the existing layout with table extraction. Any Table elements
that are extracted will have a metadata field "text_as_html" where
@ -268,59 +263,38 @@ def supplement_element_with_table_extraction(
padded_element.bbox.y2,
),
)
table_tokens = get_table_tokens_per_element(
padded_element,
ocr_layout,
table_tokens = get_table_tokens(
image=cropped_image, ocr_languages=ocr_languages, ocr_agent=ocr_agent
)
element.text_as_html = table_agent.predict(cropped_image, ocr_tokens=table_tokens)
return elements
def get_table_tokens_per_element(
table_element: LayoutElement,
ocr_layout: List[TextRegion],
def get_table_tokens(
image: PILImage,
ocr_languages: str = "eng",
ocr_agent: str = OCR_AGENT_TESSERACT,
) -> List[Dict]:
"""
Extract and prepare table tokens within the specified table element
based on the OCR layout of an entire image.
"""Get OCR tokens from either paddleocr or tesseract"""
Parameters:
- table_element (LayoutElement): The table element for which table tokens
should be extracted. It typically represents the bounding box of the table.
- ocr_layout (List[TextRegion]): A list of TextRegion objects representing
the OCR layout of the entire image.
Returns:
- List[Dict]: A list of dictionaries, each containing information about a table
token within the specified table element. Each dictionary includes the
following fields:
- 'bbox': A list of four coordinates [x1, y1, x2, y2]
relative to the table element's bounding box.
- 'text': The text content of the table token.
- 'span_num': (Optional) The span number of the table token.
- 'line_num': (Optional) The line number of the table token.
- 'block_num': (Optional) The block number of the table token.
"""
# TODO(yuming): update table_tokens from List[Dict] to List[TABLE_TOKEN]
# where TABLE_TOKEN will be a data class defined in unstructured-inference
ocr_layout = get_ocr_layout_from_image(
image,
ocr_languages=ocr_languages,
ocr_agent=ocr_agent,
)
table_tokens = []
for ocr_region in ocr_layout:
if ocr_region.bbox.is_in(
table_element.bbox,
error_margin=env_config.TABLE_TOKEN_ERROR_MARGIN,
):
table_tokens.append(
{
"bbox": [
# token bound box is relative to table element
ocr_region.bbox.x1 - table_element.bbox.x1,
ocr_region.bbox.y1 - table_element.bbox.y1,
ocr_region.bbox.x2 - table_element.bbox.x1,
ocr_region.bbox.y2 - table_element.bbox.y1,
],
"text": ocr_region.text,
},
)
table_tokens.append(
{
"bbox": [
ocr_region.bbox.x1,
ocr_region.bbox.y1,
ocr_region.bbox.x2,
ocr_region.bbox.y2,
],
"text": ocr_region.text,
}
)
# 'table_tokens' is a list of tokens
# Need to be in a relative reading order
@ -496,7 +470,7 @@ def get_ocr_layout_tesseract(
text_height < env_config.TESSERACT_MIN_TEXT_HEIGHT
or text_height > env_config.TESSERACT_MAX_TEXT_HEIGHT
):
# rounding avoids unnecessary precision and potential numerical issues assocaited
# rounding avoids unnecessary precision and potential numerical issues associated
# with numbers very close to 1 inside cv2 image processing
zoom = np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1)
ocr_df = unstructured_pytesseract.image_to_data(

View File

@ -72,16 +72,9 @@ class ENVConfig:
"""optimum text height for tesseract OCR"""
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
@property
def TABLE_TOKEN_ERROR_MARGIN(self) -> float:
"""error margin when comparing if a ocr region is within the table element when perparing
table tokens
"""
return self._get_float("TABLE_TOKEN_ERROR_MARGIN", 0.0)
@property
def OCR_AGENT(self) -> str:
"""error margin when comparing if a ocr region is within the table element when perparing
"""error margin when comparing if a ocr region is within the table element when preparing
table tokens
"""
return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)