Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-06-27 02:30:08 +00:00
chore: add metric helper for table structure eval (#1877)
- Add a helper to run inference over an image or PDF of a table and compare the result against a ground-truth CSV file (see the usage sketch below).
- The metric produces a similarity score between 0 and 1, where 1 is a perfect match and 0 is no match at all.
- Add example docs for testing.
- NOTE: this metric only covers table structure detection, so the input should be just the table area of an image/PDF file; table element detection is not evaluated by this metric.
Parent: b1534af55c
Commit: 42f8cf1997
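For context, a minimal usage sketch of the new helper, assuming unstructured_inference and the table transformer model weights are installed; the paths are the example docs added in this commit:

from unstructured.metrics.table_structure import eval_table_transformer_for_file

# compare the predicted table structure for the example image against the ground-truth CSV;
# the score falls in [0, 1], where 1.0 means the predicted rows/columns match the CSV exactly
score = eval_table_transformer_for_file(
    "example-docs/table-multi-row-column-cells.png",
    "example-docs/table-multi-row-column-cells-actual.csv",
)
print(f"table structure similarity: {score:.3f}")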
CHANGELOG.md
@@ -3,6 +3,7 @@
 ### Enhancements

 * **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
+* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.

 ### Features
example-docs/table-multi-row-column-cells-actual.csv (new file, 6 lines)
@@ -0,0 +1,6 @@

Disability Category,Participants,Ballots Completed,Ballots Incomplete/Terminated,Results,
,,,,Accuracy,Time to complete
Blind,5,1,4,"34.5%, n=1","1199 sec, n=1"
Low Vision,5,2,3,"98.3% n=2 (97.7%, n=3)","1716 sec, n=3 (1934 sec, n=2)"
Dexterity,5,4,1,"98.3%, n=4","1672.1 sec, n=4"
Mobility,3,3,0,"95.4%, n=3","1416 sec, n=3"
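A small sketch (pandas only, path as added in this commit) of how this ground-truth CSV loads: the first line becomes the column header, and the second line carries the sub-headers ("Accuracy", "Time to complete") for the merged "Results" cell, which is why the file has two header-like rows.

import pandas as pd

# the first CSV line becomes the column header; the trailing comma yields an extra unnamed column
actual = pd.read_csv("example-docs/table-multi-row-column-cells-actual.csv")
print(actual.shape)           # (5, 6): 5 data rows, 6 columns
print(actual.columns.tolist())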
example-docs/table-multi-row-column-cells.pdf (new binary file, not shown)
example-docs/table-multi-row-column-cells.png (new binary file, 78 KiB, not shown)
test_unstructured/metrics/test_table_structure.py (new file, 27 lines)
@@ -0,0 +1,27 @@

import pytest

from unstructured.metrics.table_structure import (
    eval_table_transformer_for_file,
    image_or_pdf_to_dataframe,
)


@pytest.mark.parametrize(
    "filename",
    [
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells.pdf",
    ],
)
def test_image_or_pdf_to_dataframe(filename):
    df = image_or_pdf_to_dataframe(filename)
    assert ["Blind", "5", "1", "4", "34.5%, n=1", "1199 sec, n=1"] in df.values


def test_eval_table_transformer_for_file():
    score = eval_table_transformer_for_file(
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells-actual.csv",
    )
    # avoid severe degradation of performance
    assert 0.8 < score < 1
unstructured/metrics/table_structure.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
from PIL import Image

from unstructured.partition.pdf import convert_pdf_to_images
from unstructured.utils import requires_dependencies


@requires_dependencies("unstructured_inference")
def image_or_pdf_to_dataframe(filename: str) -> pd.DataFrame:
    """Helper to JUST run the table transformer on the input image/pdf file. It assumes the input
    is JUST a table. This is intended to facilitate metric tracking of table structure detection
    ALONE, without mixing in metrics from the element detection model."""
    from unstructured_inference.models.tables import load_agent, tables_agent

    load_agent()

    # for PDFs, run the model on the first page rendered as an RGB image
    if filename.endswith(".pdf"):
        image = list(convert_pdf_to_images(filename))[0].convert("RGB")
    else:
        image = Image.open(filename).convert("RGB")

    return tables_agent.run_prediction(image, result_format="dataframe")


@requires_dependencies("unstructured_inference")
def eval_table_transformer_for_file(
    filename: str,
    true_table_filename: str,
    eval_func: str = "token_ratio",
) -> float:
    """Evaluate the predicted table structure vs. the actual table structure, by column and by
    row, as a number between 0 and 1."""
    from unstructured_inference.models.eval import compare_contents_as_df

    pred_table = image_or_pdf_to_dataframe(filename).fillna("").replace(np.nan, "")
    actual_table = pd.read_csv(true_table_filename).astype(str).fillna("").replace(np.nan, "")

    # average the scores returned by compare_contents_as_df (percentages) and rescale to [0, 1]
    results = np.array(
        list(compare_contents_as_df(actual_table, pred_table, eval_func=eval_func).values()),
    )
    return results.mean() / 100.0
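A minimal sketch of the final scoring step, assuming compare_contents_as_df returns a dict of percentage scores; the key names and values below are hypothetical:

import numpy as np

# hypothetical per-axis percentages as returned by compare_contents_as_df(...)
scores = {"by_col_token_ratio": 92.0, "by_row_token_ratio": 88.0}

# same arithmetic as eval_table_transformer_for_file: mean of the scores, rescaled to [0, 1]
similarity = np.array(list(scores.values())).mean() / 100.0
print(similarity)  # 0.9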