mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-29 08:05:08 +00:00
Fix: missing columns on table ingest output after table OCR refactor (#1959)
Closes #1873.

### Summary
Table OCR refactoring changed the default padding value for table image cropping from [12](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L95) to [0](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/ocr.py#L260), causing some columns in the table to be missing.

### Testing
```
filename = "example-docs/layout-parser-paper-with-table.pdf"
elements = pdf.partition_pdf(
    filename=filename,
    strategy="hi_res",
    infer_table_structure=True,
)
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
assert "Large Model" in table[0]
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
This commit is contained in:
parent
a06b151897
commit
210d53a7e0
@ -1,4 +1,4 @@
|
||||
## 0.10.29-dev3
|
||||
## 0.10.29-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,8 @@
|
||||
|
||||
### Features
|
||||
|
||||
* **Allow setting table crop parameter** In certain circumstances, adjusting the table crop padding may improve table extraction.
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Ingest session handler not being shared correctly** All ingest docs that leverage the session handler should only need to set it once per process. It was being recreated each time because the right values were neither set nor available, given how dataclasses work in Python.
|
||||
|
||||
BIN
example-docs/layout-parser-paper-with-table.pdf
Normal file
BIN
example-docs/layout-parser-paper-with-table.pdf
Normal file
Binary file not shown.
@ -416,10 +416,10 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("ocr_mode"),
|
||||
"ocr_mode",
|
||||
[
|
||||
("entire_page"),
|
||||
("individual_blocks"),
|
||||
"entire_page",
|
||||
"individual_blocks",
|
||||
],
|
||||
)
|
||||
def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.29-dev3" # pragma: no cover
|
||||
__version__ = "0.10.29-dev5" # pragma: no cover
|
||||
|
||||
@ -257,7 +257,7 @@ def supplement_element_with_table_extraction(
|
||||
"""
|
||||
for element in elements:
|
||||
if element.type == "Table":
|
||||
padding = env_config.IMAGE_CROP_PAD
|
||||
padding = env_config.TABLE_IMAGE_CROP_PAD
|
||||
padded_element = pad_element_bboxes(element, padding=padding)
|
||||
cropped_image = image.crop(
|
||||
(
|
||||
|
||||
@ -35,6 +35,15 @@ class ENVConfig:
|
||||
"""extra image content to add around an identified element region; measured in pixels"""
|
||||
return self._get_int("IMAGE_CROP_PAD", 0)
|
||||
|
||||
@property
|
||||
def TABLE_IMAGE_CROP_PAD(self) -> int:
|
||||
"""extra image content to add around an identified table region; measured in pixels
|
||||
|
||||
The padding adds image data around an identified table bounding box for downstream table
|
||||
structure detection model use as input
|
||||
"""
|
||||
return self._get_int("TABLE_IMAGE_CROP_PAD", 0)
|
||||
|
||||
@property
|
||||
def TESSERACT_TEXT_HEIGHT_QUANTILE(self) -> float:
|
||||
"""the quantile to check for text height"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user