mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
chore: add metric helper for table structure eval (#1877)
- add a helper to run inference over an image or PDF of a table and compare it against a ground-truth CSV file - this metric generates a similarity score between 0 and 1, where 1 is a perfect match and 0 is no match at all - add example docs for testing - NOTE: this metric is only relevant to table structure detection. Therefore the input should be just the table area in an image/PDF file; we are not evaluating table element detection in this metric
This commit is contained in:
parent
b1534af55c
commit
42f8cf1997
@ -3,6 +3,7 @@
|
||||
### Enhancements
|
||||
|
||||
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
|
||||
* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.
|
||||
|
||||
### Features
|
||||
|
||||
|
6
example-docs/table-multi-row-column-cells-actual.csv
Normal file
6
example-docs/table-multi-row-column-cells-actual.csv
Normal file
@ -0,0 +1,6 @@
|
||||
Disability Category,Participants,Ballots Completed,Ballots Incomplete/Terminated,Results,
|
||||
,,,,Accuracy,Time to complete
|
||||
Blind,5,1,4,"34.5%, n=1","1199 sec, n=1"
|
||||
Low Vision,5,2,3,"98.3% n=2 (97.7%, n=3)","1716 sec, n=3 (1934 sec, n=2)"
|
||||
Dexterity,5,4,1,"98.3%, n=4","1672.1 sec, n=4"
|
||||
Mobility,3,3,0,"95.4%, n=3","1416 sec, n=3"
|
|
BIN
example-docs/table-multi-row-column-cells.pdf
Normal file
BIN
example-docs/table-multi-row-column-cells.pdf
Normal file
Binary file not shown.
BIN
example-docs/table-multi-row-column-cells.png
Normal file
BIN
example-docs/table-multi-row-column-cells.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 78 KiB |
27
test_unstructured/metrics/test_table_structure.py
Normal file
27
test_unstructured/metrics/test_table_structure.py
Normal file
@ -0,0 +1,27 @@
|
||||
import pytest
|
||||
|
||||
from unstructured.metrics.table_structure import (
|
||||
eval_table_transformer_for_file,
|
||||
image_or_pdf_to_dataframe,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    [
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells.pdf",
    ],
)
def test_image_or_pdf_to_dataframe(filename):
    """Check that the predicted table contains the complete "Blind" row.

    Runs once for the image fixture and once for the pdf fixture.
    """
    df = image_or_pdf_to_dataframe(filename)
    expected_row = ["Blind", "5", "1", "4", "34.5%, n=1", "1199 sec, n=1"]
    # `expected_row in df.values` would evaluate `(df.values == expected_row).any()`, which is
    # already True when a single cell matches. Comparing against a list of rows requires the
    # whole row to be present.
    assert expected_row in df.values.tolist()
|
||||
|
||||
|
||||
def test_eval_table_transformer_for_file():
    """Guard against a severe drop in table structure similarity on the example table."""
    score = eval_table_transformer_for_file(
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells-actual.csv",
    )
    # avoid severe degradation of performance; the upper bound is inclusive because a score of
    # exactly 1 means a perfect match and must not fail the test
    assert 0.8 < score <= 1
|
42
unstructured/metrics/table_structure.py
Normal file
42
unstructured/metrics/table_structure.py
Normal file
@ -0,0 +1,42 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from PIL import Image
|
||||
|
||||
from unstructured.partition.pdf import convert_pdf_to_images
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
|
||||
@requires_dependencies("unstructured_inference")
def image_or_pdf_to_dataframe(filename: str) -> pd.DataFrame:
    """helper to JUST run table transformer on the input image/pdf file. It assumes the input is
    JUST a table. This is intended to facilitate metric tracking on table structure detection ALONE
    without mixing metric of element detection model

    Args:
        filename: path to the input file. A name ending in ".pdf" (any letter case) is converted
            with `convert_pdf_to_images` and only the first resulting image is used; any other
            file is opened directly as an image.

    Returns:
        The table agent's prediction for the image, requested with result_format="dataframe".
    """
    from unstructured_inference.models.tables import load_agent, tables_agent

    load_agent()

    # compare the extension case-insensitively so that e.g. "TABLE.PDF" is not sent to Image.open
    if filename.lower().endswith(".pdf"):
        image = list(convert_pdf_to_images(filename))[0].convert("RGB")
    else:
        # convert() returns a new, fully loaded image, so the file handle held by the opened
        # image can be released as soon as the conversion is done
        with Image.open(filename) as opened_image:
            image = opened_image.convert("RGB")

    return tables_agent.run_prediction(image, result_format="dataframe")
|
||||
|
||||
|
||||
@requires_dependencies("unstructured_inference")
def eval_table_transformer_for_file(
    filename: str,
    true_table_filename: str,
    eval_func: str = "token_ratio",
) -> float:
    """evaluate the predicted table structure vs. actual table structure by column and row as a
    number between 0 and 1

    Args:
        filename: path to the image or pdf file that contains just the table.
        true_table_filename: path to the csv file holding the ground truth table.
        eval_func: name of the comparison function, passed through to `compare_contents_as_df`.

    Returns:
        The mean of the values returned by `compare_contents_as_df`, divided by 100.
    """
    from unstructured_inference.models.eval import compare_contents_as_df

    pred_table = image_or_pdf_to_dataframe(filename).fillna("")
    # Read every ground truth cell verbatim as text. Letting pandas infer dtypes and casting with
    # astype(str) afterwards turns empty cells into the literal string "nan" (so a later
    # fillna("") has nothing left to fill) and renders integers in columns that contain empty
    # cells as "5.0" instead of "5".
    actual_table = pd.read_csv(true_table_filename, dtype=str, keep_default_na=False)

    results = np.array(
        list(compare_contents_as_df(actual_table, pred_table, eval_func=eval_func).values()),
    )
    return results.mean() / 100.0
|
Loading…
x
Reference in New Issue
Block a user