From 3fe480799ac577d0b15fea4d33d6409258c4e136 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 9 Nov 2023 10:29:55 -0800 Subject: [PATCH] Fix: missing characters at the beginning of sentences in table ingest output after the table OCR refactor (#1961) Closes #1875. ### Summary - add functionality to do a second OCR on cropped table images - use `IMAGE_CROP_PAD` env for `individual_blocks` mode (minimal sketches of both changes appear after the patch) ### Testing The test function [`test_partition_pdf_hi_res_ocr_mode_with_table_extraction()`](https://github.com/Unstructured-IO/unstructured/blob/main/test_unstructured/partition/pdf_image/test_pdf.py#L425) in `test_pdf.py` should pass. ### NOTE: I experimented with values for the scaling ENVs in the following PRs, but found that changing those values affects the entire page OCR output (an OCR regression), so I switched to doing a second OCR for tables. - https://github.com/Unstructured-IO/unstructured/pull/1998/files - https://github.com/Unstructured-IO/unstructured/pull/2004/files - https://github.com/Unstructured-IO/unstructured/pull/2016/files - https://github.com/Unstructured-IO/unstructured/pull/2029/files --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub --- CHANGELOG.md | 3 +- .../partition/pdf_image/test_image.py | 3 +- .../partition/pdf_image/test_ocr.py | 43 +++++----- .../partition/pdf_image/test_pdf.py | 4 +- .../layout-parser-paper-with-table.jpg.json | 2 +- .../layout-parser-paper.pdf.json | 4 +- unstructured/__version__.py | 2 +- unstructured/partition/ocr.py | 86 +++++++------------ unstructured/partition/utils/config.py | 9 +- 9 files changed, 63 insertions(+), 93 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4014131f4..da504fa79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.30-dev4 +## 0.10.30-dev5 ### Enhancements @@ -7,6 +7,7 @@ ### Features +* **Add functionality to do a second OCR on cropped table images.** Changes to the values for scaling ENVs affect the entire page OCR output (an OCR regression), so we now do a second OCR for tables. * **Adds ability to pass timeout for a request when partitioning via a `url`.** `partition` now accepts a new optional parameter `request_timeout` which if set will prevent any `requests.get` from hanging indefinitely and instead will raise a timeout error. This is useful when partitioning a url that may be slow to respond or may not respond at all. 
### Fixes diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index 46f528bac..86a0a7a77 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -515,9 +515,8 @@ def test_partition_image_uses_model_name(): def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element): filename = "example-docs/layout-parser-paper-fast.jpg" elements = image.partition_image(filename=filename, ocr_mode=ocr_mode, strategy="hi_res") - first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis" # Note(yuming): idx_title_element is different based on xy-cut and ocr mode - assert elements[idx_title_element].text == first_line + assert elements[idx_title_element].category == ElementType.TITLE def test_partition_image_hi_res_invalid_ocr_mode(): diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index cc2637fb7..4d94cd610 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import numpy as np import pandas as pd import pytest @@ -437,7 +439,7 @@ def table_element(): @pytest.fixture() -def ocr_layout(): +def mock_ocr_layout(): ocr_regions = [ TextRegion.from_coords(x1=15, y1=25, x2=35, y2=45, text="Token1"), TextRegion.from_coords(x1=40, y1=30, x2=45, y2=50, text="Token2"), @@ -445,23 +447,24 @@ def ocr_layout(): return ocr_regions -def test_get_table_tokens_per_element(table_element, ocr_layout): - table_tokens = ocr.get_table_tokens_per_element(table_element, ocr_layout) - expected_tokens = [ - { - "bbox": [5, 5, 25, 25], - "text": "Token1", - "span_num": 0, - "line_num": 0, - "block_num": 0, - }, - { - "bbox": [30, 10, 35, 30], - "text": "Token2", - "span_num": 1, - "line_num": 0, - "block_num": 0, - }, - ] +def test_get_table_tokens(mock_ocr_layout): + with patch.object(ocr, "get_ocr_layout_from_image", return_value=mock_ocr_layout): + table_tokens = ocr.get_table_tokens(image=None) + expected_tokens = [ + { + "bbox": [15, 25, 35, 45], + "text": "Token1", + "span_num": 0, + "line_num": 0, + "block_num": 0, + }, + { + "bbox": [40, 30, 45, 50], + "text": "Token2", + "span_num": 1, + "line_num": 0, + "block_num": 0, + }, + ] - assert table_tokens == expected_tokens + assert table_tokens == expected_tokens diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 7b5babda3..8273986c2 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -434,8 +434,8 @@ def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode): assert len(table) == 2 assert "
" in table[0] assert "Layouts of history Japanese documents" in table[0] - # FIXME(yuming): comment this out since there are some table regression issue - # assert "Layouts of scanned modern magazines and scientific reports" in table[0] + assert "Layouts of scanned modern magazines and scientific report" in table[0] + assert "Layouts of scanned US newspapers from the 20th century" in table[0] def test_partition_pdf_with_copy_protection(): diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index ededa714b..39d88751a 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -47,7 +47,7 @@ }, "filetype": "image/jpeg", "page_number": 1, - "text_as_html": "
Dataset| Base Model\"Large Model| Notes
PubLayNet [38]P/MMLayouts of modern scientific documents
PRImA [3)M-Layouts of scanned modern magazines and scientific reports
Newspaper [17]P-Layouts of scanned US newspapers from the 20th century
‘TableBank (18)PPTable region on modern scientific and business document
HJDataset (31)| F/M-Layouts of history Japanese documents
" + "text_as_html": "
Dataset| Base Model!|Large Model| Notes
PubLayNet [33]P/MMLayouts of modern scientific documents
PRImA [3]MLayouts of scanned modern magazines and scientific reports
Newspaper [17]PLayouts of scanned US newspapers from the 20th century
TableBank [18]PTable region on modern scientific and business document
HIDataset [31]P/MLayouts of history Japanese documents
" }, "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents" }, diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 3bec9e860..1851b878b 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -659,7 +659,7 @@ }, "filetype": "application/pdf", "page_number": 5, - "text_as_html": "
Dataset| Base Model'|Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMnned modern magazines and scientific reports
NewspapeiFcanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
" + "text_as_html": "
Dataset| Base Model'|| Notes
PubLayNet B8]|F/MLayouts of modern scientific documents
PRImAMLayouts of scanned modern magazines and scientific report
NewspaperFLayouts of scanned US newspapers from the 20th century
TableBankFTable region on modern scientific and business document
HJDatasetF/MLayouts of history Japanese documents
" }, "text": "Base Model1 Large Model Notes Dataset PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents" }, @@ -1085,7 +1085,7 @@ }, "filetype": "application/pdf", "page_number": 8, - "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio ion in x and y di
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
; block1. intersect (block2)Return the intersection region of block and block2. . . . Coordinate type to be determined based on the inputs.
; block1.union(block2)Return the union region of block1 and block2. . . . Coordinate type to be determined based on the inputs.
block1.relative_to(block2)Convert the absolute coordinates of block to ' ' relative coordinates to block2
. block1.condition_on(block2)Calculate the absolute coordinates of block1 given . the canvas block2’s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
" + "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
block. crop_image (image)Obtain the image segments in the block region
" }, "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratio in x and y direction block.scale(fx, fy) Move the current block with the shift distances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.intersect(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block1.condition on(block2) Obtain the image segments in the block region block.crop image(image)" }, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce3efec6e..d52d8591b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.30-dev4" # pragma: no cover +__version__ = "0.10.30-dev5" # pragma: no cover diff --git a/unstructured/partition/ocr.py b/unstructured/partition/ocr.py index 60d12b1b7..eccf48050 100644 --- a/unstructured/partition/ocr.py +++ b/unstructured/partition/ocr.py @@ -189,7 +189,6 @@ def supplement_page_layout_with_ocr( """ ocr_agent = get_ocr_agent() - ocr_layout = None if ocr_mode == OCRMode.FULL_PAGE.value: ocr_layout = get_ocr_layout_from_image( image, @@ -203,7 +202,8 @@ def supplement_page_layout_with_ocr( elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: for element in page_layout.elements: if element.text == "": - padded_element = pad_element_bboxes(element, padding=12) + padding = env_config.IMAGE_CROP_PAD + padded_element = pad_element_bboxes(element, padding=padding) cropped_image = image.crop( ( padded_element.bbox.x1, @@ -229,18 +229,12 @@ def supplement_page_layout_with_ocr( # Note(yuming): use the OCR data from entire page OCR for table extraction if infer_table_structure: table_agent = init_table_agent() - if ocr_layout is None: - # Note(yuming): ocr_layout is None for individual_blocks ocr_mode - ocr_layout = get_ocr_layout_from_image( - image, - ocr_languages=ocr_languages, - ocr_agent=ocr_agent, - ) page_layout.elements[:] = supplement_element_with_table_extraction( elements=cast(List[LayoutElement], page_layout.elements), - ocr_layout=ocr_layout, image=image, table_agent=table_agent, + ocr_languages=ocr_languages, + ocr_agent=ocr_agent, ) return page_layout @@ -248,9 +242,10 @@ def supplement_page_layout_with_ocr( def supplement_element_with_table_extraction( elements: List[LayoutElement], - ocr_layout: List[TextRegion], image: PILImage, table_agent: "UnstructuredTableTransformerModel", + ocr_languages: str = "eng", + ocr_agent: str = OCR_AGENT_TESSERACT, ) -> List[LayoutElement]: """Supplement the existing layout with table extraction. 
Any Table elements that are extracted will have a metadata field "text_as_html" where @@ -268,59 +263,38 @@ def supplement_element_with_table_extraction( padded_element.bbox.y2, ), ) - table_tokens = get_table_tokens_per_element( - padded_element, - ocr_layout, + table_tokens = get_table_tokens( + image=cropped_image, ocr_languages=ocr_languages, ocr_agent=ocr_agent ) element.text_as_html = table_agent.predict(cropped_image, ocr_tokens=table_tokens) return elements -def get_table_tokens_per_element( - table_element: LayoutElement, - ocr_layout: List[TextRegion], +def get_table_tokens( + image: PILImage, + ocr_languages: str = "eng", + ocr_agent: str = OCR_AGENT_TESSERACT, ) -> List[Dict]: - """ - Extract and prepare table tokens within the specified table element - based on the OCR layout of an entire image. + """Get OCR tokens from either paddleocr or tesseract""" - Parameters: - - table_element (LayoutElement): The table element for which table tokens - should be extracted. It typically represents the bounding box of the table. - - ocr_layout (List[TextRegion]): A list of TextRegion objects representing - the OCR layout of the entire image. - - Returns: - - List[Dict]: A list of dictionaries, each containing information about a table - token within the specified table element. Each dictionary includes the - following fields: - - 'bbox': A list of four coordinates [x1, y1, x2, y2] - relative to the table element's bounding box. - - 'text': The text content of the table token. - - 'span_num': (Optional) The span number of the table token. - - 'line_num': (Optional) The line number of the table token. - - 'block_num': (Optional) The block number of the table token. - """ - # TODO(yuming): update table_tokens from List[Dict] to List[TABLE_TOKEN] - # where TABLE_TOKEN will be a data class defined in unstructured-inference + ocr_layout = get_ocr_layout_from_image( + image, + ocr_languages=ocr_languages, + ocr_agent=ocr_agent, + ) table_tokens = [] for ocr_region in ocr_layout: - if ocr_region.bbox.is_in( - table_element.bbox, - error_margin=env_config.TABLE_TOKEN_ERROR_MARGIN, - ): - table_tokens.append( - { - "bbox": [ - # token bound box is relative to table element - ocr_region.bbox.x1 - table_element.bbox.x1, - ocr_region.bbox.y1 - table_element.bbox.y1, - ocr_region.bbox.x2 - table_element.bbox.x1, - ocr_region.bbox.y2 - table_element.bbox.y1, - ], - "text": ocr_region.text, - }, - ) + table_tokens.append( + { + "bbox": [ + ocr_region.bbox.x1, + ocr_region.bbox.y1, + ocr_region.bbox.x2, + ocr_region.bbox.y2, + ], + "text": ocr_region.text, + } + ) # 'table_tokens' is a list of tokens # Need to be in a relative reading order @@ -496,7 +470,7 @@ def get_ocr_layout_tesseract( text_height < env_config.TESSERACT_MIN_TEXT_HEIGHT or text_height > env_config.TESSERACT_MAX_TEXT_HEIGHT ): - # rounding avoids unnecessary precision and potential numerical issues assocaited + # rounding avoids unnecessary precision and potential numerical issues associated # with numbers very close to 1 inside cv2 image processing zoom = np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1) ocr_df = unstructured_pytesseract.image_to_data( diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index c8738d7da..d61555d7e 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -72,16 +72,9 @@ class ENVConfig: """optimum text height for tesseract OCR""" return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20) - @property - def 
TABLE_TOKEN_ERROR_MARGIN(self) -> float: - """error margin when comparing if a ocr region is within the table element when perparing - table tokens - """ - return self._get_float("TABLE_TOKEN_ERROR_MARGIN", 0.0) - @property def OCR_AGENT(self) -> str: - """error margin when comparing if a ocr region is within the table element when perparing - table tokens - """ + """The OCR agent to use.""" return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)
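
For readers tracing the `ocr.py` hunk above, here is a minimal, self-contained sketch of the idea behind the fix. The removed code filtered the full-page OCR layout down to regions inside the detected table box (using `TABLE_TOKEN_ERROR_MARGIN`) and shifted their coordinates into the table frame, which could drop or clip tokens near the table edge, the likely source of the missing leading characters. The new code crops the padded table image and runs a second OCR pass on the crop alone. This is not code from the patch: `Box`, `run_ocr`, and `table_tokens_from_crop` are hypothetical stand-ins for `TextRegion`, `get_ocr_layout_from_image`, and `get_table_tokens`, using plain `pytesseract` and a pad of 12 (the value the old `individual_blocks` code hard-coded).

```python
from dataclasses import dataclass
from typing import Dict, List

import pytesseract
from PIL import Image


@dataclass
class Box:
    """Hypothetical stand-in for unstructured's TextRegion bounding box."""

    x1: int
    y1: int
    x2: int
    y2: int
    text: str


def run_ocr(image: Image.Image) -> List[Box]:
    """Plain-pytesseract stand-in for get_ocr_layout_from_image()."""
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    return [
        Box(x, y, x + w, y + h, text)
        for x, y, w, h, text in zip(
            data["left"], data["top"], data["width"], data["height"], data["text"]
        )
        if text.strip()
    ]


def table_tokens_from_crop(page: Image.Image, table: Box, pad: int = 12) -> List[Dict]:
    """New approach: crop the padded table region, then OCR only the crop."""
    crop = page.crop((table.x1 - pad, table.y1 - pad, table.x2 + pad, table.y2 + pad))
    # Token bboxes come back relative to the cropped table image, so no
    # coordinate shifting and no error-margin filtering against the table
    # box is needed; tokens at the table edge are no longer dropped.
    return [
        {
            "bbox": [region.x1, region.y1, region.x2, region.y2],
            "text": region.text,
            "span_num": i,
            "line_num": 0,
            "block_num": 0,
        }
        for i, region in enumerate(run_ocr(crop))
    ]
```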
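
And a usage sketch for the second Summary item: in `individual_blocks` mode, the per-element crop pad is now read from the `IMAGE_CROP_PAD` environment variable instead of being hard-coded to 12. The snippet below is an illustration under stated assumptions, not patch documentation: the image path is the fixture from the tests above, and passing `infer_table_structure=True` to `partition_image` is inferred from those tests.

```python
import os

# Assumption: "12" mirrors the previously hard-coded pad; larger values pull
# in more surrounding pixels before each per-block OCR crop. Set it before
# partitioning so env_config picks it up (hence the late import below).
os.environ["IMAGE_CROP_PAD"] = "12"

from unstructured.partition.image import partition_image

elements = partition_image(
    filename="example-docs/layout-parser-paper-with-table.jpg",  # test fixture
    strategy="hi_res",
    ocr_mode="individual_blocks",  # the mode that now honors IMAGE_CROP_PAD
    infer_table_structure=True,    # triggers the second OCR on table crops
)

# Tables now carry HTML produced from the second, crop-level OCR pass.
tables = [el for el in elements if el.category == "Table"]
print(tables[0].metadata.text_as_html if tables else "no tables found")
```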