Chore: stop passing extract_tables to inference and note table regression on entire doc OCR (#1850)

### Summary

A follow-up to
https://github.com/Unstructured-IO/unstructured/pull/1801: I forgot to
remove the lines that pass `extract_tables` to inference. This PR removes
them and notes the table regression seen when we only do one OCR pass for
the entire doc.

**Tech details:**
* stop passing `extract_tables` parameter to inference
* added table extraction ingest test for image, which was skipped
before, and the "text_as_html" field contains the OCR output from the
table OCR refactor PR
* replaced `assert_called_once_with` with `call_args` so that the unit
tests don't need to test additional parameters
* added `error_margin` as an ENV setting used when comparing the bounding
boxes of `ocr_region` with `table_element`
* added more tests for tables and noted the table regression in test for
partition pdf

### Test
* for stop passing `extract_tables` parameter to inference, run test
`test_partition_pdf_hi_res_ocr_mode_with_table_extraction` before this
branch and you will see warning like `Table OCR from get_tokens method
will be deprecated....`, which means it called the table OCR in
inference repo. This branch removed the warning.
This commit is contained in:
Yuming Long 2023-10-24 13:13:28 -04:00 committed by GitHub
parent 44cef80c82
commit 01a0e003d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 50 additions and 37 deletions

View File

@ -1,4 +1,4 @@
## 0.10.26-dev3
## 0.10.26-dev4
### Enhancements
@ -10,6 +10,8 @@
### Fixes
* **Stop passing `extract_tables` to unstructured-inference** since it is now supported in unstructured instead. Also noted the table
output regression for PDF files.
* **Fix a bug on Table partitioning** Previously the `skip_infer_table_types` variable used in partition was not being passed down to specific file partitioners. Now you can utilize the `skip_infer_table_types` list variable in partition to pass the filetype you want to exclude `text_as_html` metadata field for, or the `infer_table_structure` boolean variable on the file specific partitioning function.
* **Fix partition docx without sections** Some docx files, like those from teams output, do not contain sections and it would produce no results because the code assumes all components are in sections. Now if no sections is detected from a document we iterate through the paragraphs and return contents found in the paragraphs.

View File

@ -54,4 +54,5 @@ docker run --rm -v "$SCRIPT_DIR"/../unstructured:/root/unstructured \
./test_unstructured_ingest/test-ingest-biomed-path.sh &&
./test_unstructured_ingest/test-ingest-s3.sh &&
./test_unstructured_ingest/test-ingest-slack.sh &&
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh"
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh &&
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh"

View File

@ -540,6 +540,7 @@ def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
assert len(table) == 1
assert "<table><thead><th>" in table[0]
assert "Layouts of history Japanese documents" in table[0]
assert "Layouts of scanned modern magazines and scientific reports" in table[0]
def test_partition_image_raises_TypeError_for_invalid_languages():

View File

@ -177,15 +177,7 @@ def test_partition_pdf_with_model_name_env_var(
mock.MagicMock(),
) as mock_process:
pdf.partition_pdf(filename=filename, strategy="hi_res")
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)
assert mock_process.call_args[1]["model_name"] == "checkbox"
def test_partition_pdf_with_model_name(
@ -199,15 +191,7 @@ def test_partition_pdf_with_model_name(
mock.MagicMock(),
) as mock_process:
pdf.partition_pdf(filename=filename, strategy="hi_res", model_name="checkbox")
mock_process.assert_called_once_with(
filename,
is_image=False,
pdf_image_dpi=mock.ANY,
extract_tables=mock.ANY,
model_name="checkbox",
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)
assert mock_process.call_args[1]["model_name"] == "checkbox"
def test_partition_pdf_with_auto_strategy(
@ -428,6 +412,29 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
assert "" in table[0]
# Runs hi_res PDF partitioning with table-structure inference once per OCR mode
# and checks the HTML table output attached to the resulting elements.
@pytest.mark.parametrize(
("ocr_mode"),
[
("entire_page"),
("individual_blocks"),
],
)
def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
filename = "example-docs/layout-parser-paper.pdf"
elements = pdf.partition_pdf(
filename=filename,
ocr_mode=ocr_mode,
strategy="hi_res",
infer_table_structure=True,
)
# Keep only the elements that carry a non-empty `text_as_html` value.
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
# Exactly two elements with HTML table output are expected for this document.
assert len(table) == 2
assert "<table><thead><th>" in table[0]
assert "Layouts of history Japanese documents" in table[0]
# FIXME(yuming): the assertion below is disabled because of a table regression issue;
# re-enable it once the regression is resolved.
# assert "Layouts of scanned modern magazines and scientific reports" in table[0]
def test_partition_pdf_with_copy_protection():
filename = os.path.join("example-docs", "copy-protected.pdf")
elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
@ -443,15 +450,7 @@ def test_partition_pdf_with_dpi():
filename = os.path.join("example-docs", "copy-protected.pdf")
with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
pdf.partition_pdf(filename=filename, strategy="hi_res", pdf_image_dpi=100)
mock_process.assert_called_once_with(
filename,
is_image=False,
extract_tables=mock.ANY,
model_name=pdf.default_hi_res_model(),
pdf_image_dpi=100,
extract_images_in_pdf=mock.ANY,
image_output_dir_path=mock.ANY,
)
assert mock_process.call_args[1]["pdf_image_dpi"] == 100
def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):

View File

@ -736,7 +736,9 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
],
)
def test_auto_partition_respects_skip_infer_table_types(
skip_infer_table_types, filename, has_text_as_html_field
skip_infer_table_types,
filename,
has_text_as_html_field,
):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
with open(filename, "rb") as f:

View File

@ -46,7 +46,8 @@
]
},
"filetype": "image/jpeg",
"page_number": 1
"page_number": 1,
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model\"</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [38]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3)</td><td>M</td><td>-</td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td>-</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank (18)</td><td>P</td><td>P</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset (31)</td><td>| F/M</td><td>-</td><td>Layouts of history Japanese documents</td></tr></table>"
},
"text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents"
},

View File

@ -22,7 +22,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--skip-infer-table-types "jpg" \
--skip-infer-table-types "xls,xlsx" \
--pdf-infer-table-structure true \
--strategy hi_res \
--verbose \

View File

@ -1 +1 @@
__version__ = "0.10.26-dev3" # pragma: no cover
__version__ = "0.10.26-dev4" # pragma: no cover

View File

@ -306,7 +306,10 @@ def get_table_tokens_per_element(
# where TABLE_TOKEN will be a data class defined in unstructured-inference
table_tokens = []
for ocr_region in ocr_layout:
if ocr_region.bbox.is_in(table_element.bbox):
if ocr_region.bbox.is_in(
table_element.bbox,
error_margin=env_config.TABLE_TOKEN_ERROR_MARGIN,
):
table_tokens.append(
{
"bbox": [
@ -330,7 +333,6 @@ def get_table_tokens_per_element(
token["line_num"] = 0
if "block_num" not in token:
token["block_num"] = 0
return table_tokens

View File

@ -380,7 +380,6 @@ def _partition_pdf_or_image_local(
out_layout = process_file_with_model(
filename,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
extract_images_in_pdf=extract_images_in_pdf,
@ -403,7 +402,6 @@ def _partition_pdf_or_image_local(
out_layout = process_data_with_model(
file,
is_image=is_image,
extract_tables=infer_table_structure,
model_name=model_name,
pdf_image_dpi=pdf_image_dpi,
extract_images_in_pdf=extract_images_in_pdf,

View File

@ -61,5 +61,12 @@ class ENVConfig:
"""optimum text height for tesseract OCR"""
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
@property
def TABLE_TOKEN_ERROR_MARGIN(self) -> float:
"""error margin when comparing if an OCR region is within the table element when preparing
table tokens
"""
return self._get_float("TABLE_TOKEN_ERROR_MARGIN", 0.0)
env_config = ENVConfig()