Fix: missing columns on table ingest output after table OCR refactor (#1959)

Closes #1873.
### Summary
Table OCR refactoring changed the default padding value for table image
cropping from
[12](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L95)
to
[0](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/ocr.py#L260),
causing some columns in the table to be missing.
### Testing
```
filename = "example-docs/layout-parser-paper-with-table.pdf"
elements = pdf.partition_pdf(
    filename=filename,
    strategy="hi_res",
    infer_table_structure=True,
)
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
assert "Large Model" in table[0]
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
This commit is contained in:
Christine Straub 2023-11-01 11:34:27 -07:00 committed by GitHub
parent a06b151897
commit 210d53a7e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 17 additions and 6 deletions

View File

@ -1,4 +1,4 @@
## 0.10.29-dev3
## 0.10.29-dev5
### Enhancements
@ -10,6 +10,8 @@
### Features
* **Allow setting table crop parameter** In certain circumstances, adjusting the table crop padding may improve table extraction.
### Fixes
* **Ingest session handler not being shared correctly** All ingest docs that leverage the session handler should only need to set it once per process. It was recreating it each time because the right values weren't being set nor available given how dataclasses work in python.

Binary file not shown.

View File

@ -416,10 +416,10 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
@pytest.mark.parametrize(
("ocr_mode"),
"ocr_mode",
[
("entire_page"),
("individual_blocks"),
"entire_page",
"individual_blocks",
],
)
def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):

View File

@ -1 +1 @@
__version__ = "0.10.29-dev3" # pragma: no cover
__version__ = "0.10.29-dev5" # pragma: no cover

View File

@ -257,7 +257,7 @@ def supplement_element_with_table_extraction(
"""
for element in elements:
if element.type == "Table":
padding = env_config.IMAGE_CROP_PAD
padding = env_config.TABLE_IMAGE_CROP_PAD
padded_element = pad_element_bboxes(element, padding=padding)
cropped_image = image.crop(
(

View File

@ -35,6 +35,15 @@ class ENVConfig:
"""extra image content to add around an identified element region; measured in pixels"""
return self._get_int("IMAGE_CROP_PAD", 0)
@property
def TABLE_IMAGE_CROP_PAD(self) -> int:
    """Extra image content to add around an identified table region; measured in pixels.

    The padding adds image data around an identified table bounding box before the crop is
    passed to the downstream table structure detection model as input.

    The value is read through ``self._get_int`` under the name ``TABLE_IMAGE_CROP_PAD``.
    """
    # Default to 12 rather than 0: the table OCR refactor dropped the table crop padding
    # from 12 to 0, which caused columns to go missing from the extracted table. A default
    # of 0 here would leave that regression in place.
    return self._get_int("TABLE_IMAGE_CROP_PAD", 12)
@property
def TESSERACT_TEXT_HEIGHT_QUANTILE(self) -> float:
"""the quantile to check for text height"""