Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-06-27 02:30:08 +00:00
chore: add metric helper for table structure eval (#1877)
- Add a helper to run inference over an image or PDF of a table and compare the result against a ground-truth CSV file (see the usage sketch below).
- The metric produces a similarity score between 0 and 1, where 1 is a perfect match and 0 is no match at all.
- Add example docs for testing.
- NOTE: this metric only covers table structure detection, so the input should be just the table area of an image/PDF file; table element detection is not evaluated by this metric.
Parent: b1534af55c
Commit: 42f8cf1997
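For context, a minimal usage sketch of the new helper, assuming unstructured_inference and the table transformer model weights are installed; the paths are the example docs added in this commit:

from unstructured.metrics.table_structure import eval_table_transformer_for_file

# compare the predicted table structure for the example image against the ground-truth CSV;
# the score falls in [0, 1], where 1.0 means the predicted rows/columns match the CSV exactly
score = eval_table_transformer_for_file(
    "example-docs/table-multi-row-column-cells.png",
    "example-docs/table-multi-row-column-cells-actual.csv",
)
print(f"table structure similarity: {score:.3f}")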
CHANGELOG.md
@@ -3,6 +3,7 @@
 ### Enhancements

 * **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
+* **Add table structure evaluation helpers** Adds functions to evaluate the similarity between predicted table structure and actual table structure.

 ### Features
example-docs/table-multi-row-column-cells-actual.csv (new file, 6 lines)
@@ -0,0 +1,6 @@

Disability Category,Participants,Ballots Completed,Ballots Incomplete/Terminated,Results,
,,,,Accuracy,Time to complete
Blind,5,1,4,"34.5%, n=1","1199 sec, n=1"
Low Vision,5,2,3,"98.3% n=2 (97.7%, n=3)","1716 sec, n=3 (1934 sec, n=2)"
Dexterity,5,4,1,"98.3%, n=4","1672.1 sec, n=4"
Mobility,3,3,0,"95.4%, n=3","1416 sec, n=3"
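A small sketch (pandas only, path as added in this commit) of how this ground-truth CSV loads: the first line becomes the column header, and the second line carries the sub-headers ("Accuracy", "Time to complete") for the merged "Results" cell, which is why the file has two header-like rows.

import pandas as pd

# the first CSV line becomes the column header; the trailing comma yields an extra unnamed column
actual = pd.read_csv("example-docs/table-multi-row-column-cells-actual.csv")
print(actual.shape)           # (5, 6): 5 data rows, 6 columns
print(actual.columns.tolist())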
example-docs/table-multi-row-column-cells.pdf (new binary file, not shown)
example-docs/table-multi-row-column-cells.png (new binary file, 78 KiB, not shown)
test_unstructured/metrics/test_table_structure.py (new file, 27 lines)
@@ -0,0 +1,27 @@

import pytest

from unstructured.metrics.table_structure import (
    eval_table_transformer_for_file,
    image_or_pdf_to_dataframe,
)


@pytest.mark.parametrize(
    "filename",
    [
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells.pdf",
    ],
)
def test_image_or_pdf_to_dataframe(filename):
    df = image_or_pdf_to_dataframe(filename)
    assert ["Blind", "5", "1", "4", "34.5%, n=1", "1199 sec, n=1"] in df.values


def test_eval_table_transformer_for_file():
    score = eval_table_transformer_for_file(
        "example-docs/table-multi-row-column-cells.png",
        "example-docs/table-multi-row-column-cells-actual.csv",
    )
    # avoid severe degradation of performance
    assert 0.8 < score < 1
unstructured/metrics/table_structure.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import numpy as np
import pandas as pd
from PIL import Image

from unstructured.partition.pdf import convert_pdf_to_images
from unstructured.utils import requires_dependencies


@requires_dependencies("unstructured_inference")
def image_or_pdf_to_dataframe(filename: str) -> pd.DataFrame:
    """Helper to JUST run the table transformer on the input image/pdf file. It assumes the input
    is JUST a table. This is intended to facilitate metric tracking of table structure detection
    ALONE, without mixing in metrics from the element detection model."""
    from unstructured_inference.models.tables import load_agent, tables_agent

    load_agent()

    # for PDFs, run the model on the first page rendered as an RGB image
    if filename.endswith(".pdf"):
        image = list(convert_pdf_to_images(filename))[0].convert("RGB")
    else:
        image = Image.open(filename).convert("RGB")

    return tables_agent.run_prediction(image, result_format="dataframe")


@requires_dependencies("unstructured_inference")
def eval_table_transformer_for_file(
    filename: str,
    true_table_filename: str,
    eval_func: str = "token_ratio",
) -> float:
    """Evaluate the predicted table structure vs. the actual table structure, by column and by
    row, as a number between 0 and 1."""
    from unstructured_inference.models.eval import compare_contents_as_df

    pred_table = image_or_pdf_to_dataframe(filename).fillna("").replace(np.nan, "")
    actual_table = pd.read_csv(true_table_filename).astype(str).fillna("").replace(np.nan, "")

    # average the scores returned by compare_contents_as_df (percentages) and rescale to [0, 1]
    results = np.array(
        list(compare_contents_as_df(actual_table, pred_table, eval_func=eval_func).values()),
    )
    return results.mean() / 100.0
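A minimal sketch of the final scoring step, assuming compare_contents_as_df returns a dict of percentage scores; the key names and values below are hypothetical:

import numpy as np

# hypothetical per-axis percentages as returned by compare_contents_as_df(...)
scores = {"by_col_token_ratio": 92.0, "by_row_token_ratio": 88.0}

# same arithmetic as eval_table_transformer_for_file: mean of the scores, rescaled to [0, 1]
similarity = np.array(list(scores.values())).mean() / 100.0
print(similarity)  # 0.9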