From 1b70ea86b3410d79f4cf1518c2b289002e4ce11f Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 3 Jan 2024 13:41:51 -0600 Subject: [PATCH] fix: update table structure eval to use new table inference interface (#2306) Provide OCR tokens to the table eval script. Right now `unstructured-inference` can compute OCR components when they are not passed in, but in a future release callers will be required to pass OCR results into the table structure extraction model: https://github.com/Unstructured-IO/unstructured-inference/blob/d3b298131352b9676bf7d65de757c85561491e31/CHANGELOG.md#0719 This PR prepares for the upcoming change by passing OCR tokens into the table structure extraction process. ## test Create a new virtual env that follows the setup in the README, then upgrade `unstructured-inference` with `pip install unstructured-inference --upgrade`. Running `PYTHONPATH=. pytest test_unstructured/metrics/test_table_structure.py` fails on the main branch but passes with this PR. --------- Co-authored-by: Austin Walker --- CHANGELOG.md | 1 + unstructured/metrics/table_structure.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a7f2539d..834055102 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ ### Fixes +* **Fix table structure metric script** Update the call to table agent to now provide OCR tokens as required * **Fix element extraction not working when using "auto" strategy for pdf and image** If element extraction is specified, the "auto" strategy falls back to the "hi_res" strategy. 
## 0.11.6 diff --git a/unstructured/metrics/table_structure.py b/unstructured/metrics/table_structure.py index ff79114f9..53f9171ad 100644 --- a/unstructured/metrics/table_structure.py +++ b/unstructured/metrics/table_structure.py @@ -3,6 +3,7 @@ import pandas as pd from PIL import Image from unstructured.partition.pdf import convert_pdf_to_images +from unstructured.partition.pdf_image.ocr import get_table_tokens from unstructured.utils import requires_dependencies @@ -20,7 +21,9 @@ def image_or_pdf_to_dataframe(filename: str) -> pd.DataFrame: else: image = Image.open(filename).convert("RGB") - return tables_agent.run_prediction(image, result_format="dataframe") + return tables_agent.run_prediction( + image, ocr_tokens=get_table_tokens(image), result_format="dataframe" + ) @requires_dependencies("unstructured_inference")