mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Chore: stop passing extract_tables to inference and note table regression on entire doc OCR (#1850)
### Summary A follow-up to https://github.com/Unstructured-IO/unstructured/pull/1801: I forgot to remove the lines that pass `extract_tables` to inference, and this change also notes the table regression that occurs if we only do one OCR pass for the entire doc. **Tech details:** * stop passing the `extract_tables` parameter to inference * added a table extraction ingest test for images, which was skipped before, and the "text_as_html" field contains the OCR output from the table OCR refactor PR * replaced `assert_called_once_with` with `call_args` so that the unit tests don't need to test additional parameters * added `error_margin` as an ENV setting when comparing bounding boxes of `ocr_region` with `table_element` * added more tests for tables and noted the table regression in the test for partition pdf ### Test * to verify that `extract_tables` is no longer passed to inference, run the test `test_partition_pdf_hi_res_ocr_mode_with_table_extraction` before this branch and you will see a warning like `Table OCR from get_tokens method will be deprecated....`, which means it called the table OCR in the inference repo. This branch removes the warning.
This commit is contained in:
parent
44cef80c82
commit
01a0e003d9
@ -1,4 +1,4 @@
|
||||
## 0.10.26-dev3
|
||||
## 0.10.26-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Stop passing `extract_tables` to unstructured-inference** since it is now supported in unstructured instead. Also noted the table
|
||||
output regression for PDF files.
|
||||
* **Fix a bug on Table partitioning** Previously the `skip_infer_table_types` variable used in partition was not being passed down to specific file partitioners. Now you can utilize the `skip_infer_table_types` list variable in partition to pass the filetype you want to exclude `text_as_html` metadata field for, or the `infer_table_structure` boolean variable on the file specific partitioning function.
|
||||
* **Fix partition docx without sections** Some docx files, like those from Teams output, do not contain sections, and partitioning them would produce no results because the code assumes all components are in sections. Now if no sections are detected in a document we iterate through the paragraphs and return the contents found in the paragraphs.
|
||||
|
||||
|
@ -54,4 +54,5 @@ docker run --rm -v "$SCRIPT_DIR"/../unstructured:/root/unstructured \
|
||||
./test_unstructured_ingest/test-ingest-biomed-path.sh &&
|
||||
./test_unstructured_ingest/test-ingest-s3.sh &&
|
||||
./test_unstructured_ingest/test-ingest-slack.sh &&
|
||||
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh"
|
||||
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh &&
|
||||
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh"
|
||||
|
@ -540,6 +540,7 @@ def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
|
||||
assert len(table) == 1
|
||||
assert "<table><thead><th>" in table[0]
|
||||
assert "Layouts of history Japanese documents" in table[0]
|
||||
assert "Layouts of scanned modern magazines and scientific reports" in table[0]
|
||||
|
||||
|
||||
def test_partition_image_raises_TypeError_for_invalid_languages():
|
||||
|
@ -177,15 +177,7 @@ def test_partition_pdf_with_model_name_env_var(
|
||||
mock.MagicMock(),
|
||||
) as mock_process:
|
||||
pdf.partition_pdf(filename=filename, strategy="hi_res")
|
||||
mock_process.assert_called_once_with(
|
||||
filename,
|
||||
is_image=False,
|
||||
pdf_image_dpi=mock.ANY,
|
||||
extract_tables=mock.ANY,
|
||||
model_name="checkbox",
|
||||
extract_images_in_pdf=mock.ANY,
|
||||
image_output_dir_path=mock.ANY,
|
||||
)
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
def test_partition_pdf_with_model_name(
|
||||
@ -199,15 +191,7 @@ def test_partition_pdf_with_model_name(
|
||||
mock.MagicMock(),
|
||||
) as mock_process:
|
||||
pdf.partition_pdf(filename=filename, strategy="hi_res", model_name="checkbox")
|
||||
mock_process.assert_called_once_with(
|
||||
filename,
|
||||
is_image=False,
|
||||
pdf_image_dpi=mock.ANY,
|
||||
extract_tables=mock.ANY,
|
||||
model_name="checkbox",
|
||||
extract_images_in_pdf=mock.ANY,
|
||||
image_output_dir_path=mock.ANY,
|
||||
)
|
||||
assert mock_process.call_args[1]["model_name"] == "checkbox"
|
||||
|
||||
|
||||
def test_partition_pdf_with_auto_strategy(
|
||||
@ -428,6 +412,29 @@ def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
|
||||
assert "업" in table[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("ocr_mode"),
|
||||
[
|
||||
("entire_page"),
|
||||
("individual_blocks"),
|
||||
],
|
||||
)
|
||||
def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
|
||||
filename = "example-docs/layout-parser-paper.pdf"
|
||||
elements = pdf.partition_pdf(
|
||||
filename=filename,
|
||||
ocr_mode=ocr_mode,
|
||||
strategy="hi_res",
|
||||
infer_table_structure=True,
|
||||
)
|
||||
table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
|
||||
assert len(table) == 2
|
||||
assert "<table><thead><th>" in table[0]
|
||||
assert "Layouts of history Japanese documents" in table[0]
|
||||
# FIXME(yuming): commented this out since there is a table regression issue
|
||||
# assert "Layouts of scanned modern magazines and scientific reports" in table[0]
|
||||
|
||||
|
||||
def test_partition_pdf_with_copy_protection():
|
||||
filename = os.path.join("example-docs", "copy-protected.pdf")
|
||||
elements = pdf.partition_pdf(filename=filename, strategy="hi_res")
|
||||
@ -443,15 +450,7 @@ def test_partition_pdf_with_dpi():
|
||||
filename = os.path.join("example-docs", "copy-protected.pdf")
|
||||
with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
|
||||
pdf.partition_pdf(filename=filename, strategy="hi_res", pdf_image_dpi=100)
|
||||
mock_process.assert_called_once_with(
|
||||
filename,
|
||||
is_image=False,
|
||||
extract_tables=mock.ANY,
|
||||
model_name=pdf.default_hi_res_model(),
|
||||
pdf_image_dpi=100,
|
||||
extract_images_in_pdf=mock.ANY,
|
||||
image_output_dir_path=mock.ANY,
|
||||
)
|
||||
assert mock_process.call_args[1]["pdf_image_dpi"] == 100
|
||||
|
||||
|
||||
def test_partition_pdf_requiring_recursive_text_grab(filename="example-docs/reliance.pdf"):
|
||||
|
@ -736,7 +736,9 @@ def test_auto_partition_xlsx_from_filename(filename="example-docs/stanley-cups.x
|
||||
],
|
||||
)
|
||||
def test_auto_partition_respects_skip_infer_table_types(
|
||||
skip_infer_table_types, filename, has_text_as_html_field
|
||||
skip_infer_table_types,
|
||||
filename,
|
||||
has_text_as_html_field,
|
||||
):
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, filename)
|
||||
with open(filename, "rb") as f:
|
||||
|
@ -46,7 +46,8 @@
|
||||
]
|
||||
},
|
||||
"filetype": "image/jpeg",
|
||||
"page_number": 1
|
||||
"page_number": 1,
|
||||
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model\"</th><th>Large Model</th><th>| Notes</th></thead><tr><td>PubLayNet [38]</td><td>P/M</td><td>M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3)</td><td>M</td><td>-</td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper [17]</td><td>P</td><td>-</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>‘TableBank (18)</td><td>P</td><td>P</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset (31)</td><td>| F/M</td><td>-</td><td>Layouts of history Japanese documents</td></tr></table>"
|
||||
},
|
||||
"text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents"
|
||||
},
|
||||
|
@ -22,7 +22,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--num-processes "$max_processes" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--skip-infer-table-types "jpg" \
|
||||
--skip-infer-table-types "xls,xlsx" \
|
||||
--pdf-infer-table-structure true \
|
||||
--strategy hi_res \
|
||||
--verbose \
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.26-dev3" # pragma: no cover
|
||||
__version__ = "0.10.26-dev4" # pragma: no cover
|
||||
|
@ -306,7 +306,10 @@ def get_table_tokens_per_element(
|
||||
# where TABLE_TOKEN will be a data class defined in unstructured-inference
|
||||
table_tokens = []
|
||||
for ocr_region in ocr_layout:
|
||||
if ocr_region.bbox.is_in(table_element.bbox):
|
||||
if ocr_region.bbox.is_in(
|
||||
table_element.bbox,
|
||||
error_margin=env_config.TABLE_TOKEN_ERROR_MARGIN,
|
||||
):
|
||||
table_tokens.append(
|
||||
{
|
||||
"bbox": [
|
||||
@ -330,7 +333,6 @@ def get_table_tokens_per_element(
|
||||
token["line_num"] = 0
|
||||
if "block_num" not in token:
|
||||
token["block_num"] = 0
|
||||
|
||||
return table_tokens
|
||||
|
||||
|
||||
|
@ -380,7 +380,6 @@ def _partition_pdf_or_image_local(
|
||||
out_layout = process_file_with_model(
|
||||
filename,
|
||||
is_image=is_image,
|
||||
extract_tables=infer_table_structure,
|
||||
model_name=model_name,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
@ -403,7 +402,6 @@ def _partition_pdf_or_image_local(
|
||||
out_layout = process_data_with_model(
|
||||
file,
|
||||
is_image=is_image,
|
||||
extract_tables=infer_table_structure,
|
||||
model_name=model_name,
|
||||
pdf_image_dpi=pdf_image_dpi,
|
||||
extract_images_in_pdf=extract_images_in_pdf,
|
||||
|
@ -61,5 +61,12 @@ class ENVConfig:
|
||||
"""optimum text height for tesseract OCR"""
|
||||
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
|
||||
|
||||
@property
|
||||
def TABLE_TOKEN_ERROR_MARGIN(self) -> float:
|
||||
"""error margin when comparing if a ocr region is within the table element when perparing
|
||||
table tokens
|
||||
"""
|
||||
return self._get_float("TABLE_TOKEN_ERROR_MARGIN", 0.0)
|
||||
|
||||
|
||||
env_config = ENVConfig()
|
||||
|
Loading…
x
Reference in New Issue
Block a user