chore: add metric helper for table structure eval (#1877)

- add a helper to run inference over an image or PDF of a table and compare
it against a ground truth CSV file
- this metric generates a similarity score between 0 and 1, where 1 is a
perfect match and 0 is no match at all (see the usage sketch below)
- add example docs for testing
- NOTE: this metric is only relevant to table structure detection.
Therefore the input should be just the table area of an image/PDF file;
we are not evaluating table element detection with this metric
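
A minimal usage sketch of the new helper against the example docs added in this commit (the exact score depends on the model; the test below only pins it above 0.8):

from unstructured.metrics.table_structure import eval_table_transformer_for_file

# run table-transformer inference on a cropped table image and compare the
# predicted structure against the ground-truth CSV; the score lies in [0, 1]
score = eval_table_transformer_for_file(
    "example-docs/table-multi-row-column-cells.png",
    "example-docs/table-multi-row-column-cells-actual.csv",
)
print(score)  # e.g. somewhere above 0.8 for a healthy model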
Yao You · 2023-10-27 · commit 42f8cf1997

@@ -3,6 +3,7 @@
### Enhancements
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure per-file as well as aggregate-level extraction performance.
* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
### Features

@@ -0,0 +1,6 @@
Disability Category,Participants,Ballots Completed,Ballots Incomplete/Terminated,Results,
,,,,Accuracy,Time to complete
Blind,5,1,4,"34.5%, n=1","1199 sec, n=1"
Low Vision,5,2,3,"98.3% n=2 (97.7%, n=3)","1716 sec, n=3 (1934 sec, n=2)"
Dexterity,5,4,1,"98.3%, n=4","1672.1 sec, n=4"
Mobility,3,3,0,"95.4%, n=3","1416 sec, n=3"
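
For orientation, a small sketch of how this ground-truth CSV loads with pandas (as the eval helper below does): read_csv takes the first line as the header, so the Accuracy / Time to complete sub-header becomes the first data row.

import pandas as pd

# the trailing comma in the header line yields a sixth, unnamed column;
# the sub-header ("Accuracy", "Time to complete") lands in the first data row
actual = pd.read_csv("example-docs/table-multi-row-column-cells-actual.csv")
print(actual.shape)        # (5, 6)
print(actual.iloc[0, 4:])  # Accuracy / Time to complete values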

(two binary example docs added but not shown: the table PDF and the 78 KiB table image used by the tests below)

@@ -0,0 +1,27 @@
import pytest

from unstructured.metrics.table_structure import (
    eval_table_transformer_for_file,
    image_or_pdf_to_dataframe,
)


@pytest.mark.parametrize(
    "filename",
    [
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells.pdf",
    ],
)
def test_image_or_pdf_to_dataframe(filename):
    df = image_or_pdf_to_dataframe(filename)
    assert ["Blind", "5", "1", "4", "34.5%, n=1", "1199 sec, n=1"] in df.values


def test_eval_table_transformer_for_file():
    score = eval_table_transformer_for_file(
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells-actual.csv",
    )
    # avoid severe degradation of performance
    assert 0.8 < score < 1
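
A note on the membership assertion above: `list in df.values` goes through NumPy's `__contains__`, which evaluates `(probe == values).any()`, so it is satisfied by any elementwise match after broadcasting rather than requiring an exact full-row match. A tiny self-contained sketch:

import numpy as np

values = np.array([["Blind", "5"], ["Mobility", "3"]], dtype=object)
# `in` on an ndarray is (probe == values).any(): the probe is broadcast
# against each row and any single elementwise match makes it truthy
print(["Blind", "5"] in values)   # True
print(["Blind", "99"] in values)  # also True: "Blind" matches one cell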

@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
from PIL import Image

from unstructured.partition.pdf import convert_pdf_to_images
from unstructured.utils import requires_dependencies


@requires_dependencies("unstructured_inference")
def image_or_pdf_to_dataframe(filename: str) -> pd.DataFrame:
    """Helper to run JUST the table transformer on the input image/pdf file. It assumes the
    input is JUST a table. This is intended to facilitate metric tracking on table structure
    detection ALONE, without mixing in metrics from the element detection model."""
    from unstructured_inference.models.tables import load_agent, tables_agent

    load_agent()

    if filename.endswith(".pdf"):
        # use only the first page; the input is assumed to be a single table
        image = list(convert_pdf_to_images(filename))[0].convert("RGB")
    else:
        image = Image.open(filename).convert("RGB")
    return tables_agent.run_prediction(image, result_format="dataframe")


@requires_dependencies("unstructured_inference")
def eval_table_transformer_for_file(
    filename: str,
    true_table_filename: str,
    eval_func: str = "token_ratio",
) -> float:
    """Evaluate the predicted table structure against the actual table structure, by column
    and row, as a number between 0 and 1."""
    from unstructured_inference.models.eval import compare_contents_as_df

    pred_table = image_or_pdf_to_dataframe(filename).fillna("").replace(np.nan, "")
    actual_table = pd.read_csv(true_table_filename).astype(str).fillna("").replace(np.nan, "")

    results = np.array(
        list(compare_contents_as_df(actual_table, pred_table, eval_func=eval_func).values()),
    )
    return results.mean() / 100.0
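
The returned score is the mean of the per-comparison values from compare_contents_as_df, rescaled to [0, 1]. A minimal sketch of that last step, with hypothetical dictionary keys and values standing in for the real output:

import numpy as np

# hypothetical output of compare_contents_as_df: percentage scores (0-100)
# for column-wise and row-wise comparisons, as the /100.0 rescaling suggests;
# the real keys may differ
results_dict = {
    "by_col_token_ratio": 96.0,
    "by_row_token_ratio": 88.0,
}
results = np.array(list(results_dict.values()))
score = results.mean() / 100.0
print(score)  # (96.0 + 88.0) / 2 / 100 = 0.92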