mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-11-02 02:53:31 +00:00)
feat: separate evaluate grouping function (#2572)
Separate the aggregating functionality of `text_extraction_accuracy` into a stand-alone function, so that evaluation effort is not duplicated when the granular-level results are already available. To test, run `PYTHONPATH=. pytest test_unstructured/metrics/test_evaluate.py` locally.
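A minimal usage sketch of the split (paths are illustrative, not from this PR; the signatures match the diff below):

```python
import pandas as pd

from unstructured.metrics.evaluate import (
    group_text_extraction_accuracy,
    measure_text_extraction_accuracy,
)

# Run the granular, per-document evaluation once; this writes all-docs-cct.tsv
# (among other outputs) into export_dir.
measure_text_extraction_accuracy(
    output_dir="structured_output",  # illustrative paths
    source_dir="gold_standard_cct",
    export_dir="metrics",
)

# Later, aggregate the already-computed results by "doctype" or "connector"
# without re-running the per-document evaluation. data_input may be the TSV
# written above or an in-memory DataFrame with the same columns.
group_text_extraction_accuracy(
    grouping="doctype",
    data_input="metrics/all-docs-cct.tsv",
    export_dir="metrics",
)
```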
This commit is contained in:
parent d3242fb546
commit daaf1775b4
@@ -10,7 +10,7 @@
 ### Fixes

 * **Add OctoAI embedder** Adds support for embeddings via OctoAI.
-* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
+* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors**
 * **Fix don't treat plain text files with double quotes as JSON ** If a file can be deserialized as JSON but it deserializes as a string, treat it as plain text even though it's valid JSON.
 * **Fix cluster of bugs in `partition_xlsx()` that dropped content.** Algorithm for detecting "subtables" within a worksheet dropped table elements for certain patterns of populated cells such as when a trailing single-cell row appeared in a contiguous block of populated cells.
@@ -6,6 +6,7 @@ import pandas as pd
 import pytest

 from unstructured.metrics.evaluate import (
+    group_text_extraction_accuracy,
     measure_element_type_accuracy,
     measure_table_structure_accuracy,
     measure_text_extraction_accuracy,
@@ -25,6 +26,20 @@ GOLD_TABLE_STRUCTURE_DIRNAME = "gold_standard_table_structure"
 UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct"
 UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME = "unstructured_output_table_structure"

+DUMMY_DF = pd.DataFrame(
+    {
+        "filename": [
+            "Bank Good Credit Loan.pptx",
+            "Performance-Audit-Discussion.pdf",
+            "currency.csv",
+        ],
+        "doctype": ["pptx", "pdf", "csv"],
+        "connector": ["connector1", "connector1", "connector2"],
+        "cct-accuracy": [0.812, 0.994, 0.887],
+        "cct-%missing": [0.001, 0.002, 0.041],
+    }
+)
+

 @pytest.fixture()
 def _cleanup_after_test():
@@ -60,7 +75,7 @@ def test_text_extraction_evaluation():
 def test_text_extraction_evaluation_type_txt():
     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_CCT_DIRNAME)
     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
-    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct_txt")
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
     measure_text_extraction_accuracy(
         output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, output_type="txt"
     )
@@ -125,7 +140,7 @@ def test_text_extraction_takes_list():

 @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 @pytest.mark.usefixtures("_cleanup_after_test")
-def test_text_extraction_grouping():
+def test_text_extraction_with_grouping():
     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
@@ -145,3 +160,63 @@ def test_text_extraction_wrong_type():
     measure_text_extraction_accuracy(
         output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, output_type="wrong"
     )
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.usefixtures("_cleanup_after_test")
+@pytest.mark.parametrize(("grouping", "count_row"), [("doctype", 3), ("connector", 2)])
+def test_group_text_extraction_df_input(grouping, count_row):
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
+    group_text_extraction_accuracy(grouping=grouping, data_input=DUMMY_DF, export_dir=export_dir)
+    grouped_df = pd.read_csv(os.path.join(export_dir, f"all-{grouping}-agg-cct.tsv"), sep="\t")
+    assert grouped_df[grouping].dropna().nunique() == count_row
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.usefixtures("_cleanup_after_test")
+def test_group_text_extraction_tsv_input():
+    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
+    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
+    measure_text_extraction_accuracy(
+        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
+    )
+    filename = os.path.join(export_dir, "all-docs-cct.tsv")
+    group_text_extraction_accuracy(grouping="doctype", data_input=filename, export_dir=export_dir)
+    grouped_df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
+    assert grouped_df["doctype"].dropna().nunique() == 3
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.usefixtures("_cleanup_after_test")
+def test_group_text_extraction_invalid_group():
+    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
+    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
+    measure_text_extraction_accuracy(
+        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
+    )
+    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
+    with pytest.raises(ValueError):
+        group_text_extraction_accuracy(grouping="invalid", data_input=df, export_dir=export_dir)
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_text_extraction_grouping_empty_df():
+    empty_df = pd.DataFrame()
+    with pytest.raises(SystemExit):
+        group_text_extraction_accuracy("doctype", empty_df, "some_dir")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_group_text_extraction_accuracy_missing_grouping_column():
+    df_with_no_grouping = pd.DataFrame({"some_column": [1, 2, 3]})
+    with pytest.raises(SystemExit):
+        group_text_extraction_accuracy("doctype", df_with_no_grouping, "some_dir")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_group_text_extraction_accuracy_all_null_grouping_column():
+    df_with_null_grouping = pd.DataFrame({"doctype": [None, None, None]})
+    with pytest.raises(SystemExit):
+        group_text_extraction_accuracy("doctype", df_with_null_grouping, "some_dir")
@@ -5,6 +5,7 @@ from typing import List, Optional, Tuple
 import click

 from unstructured.metrics.evaluate import (
+    group_text_extraction_accuracy,
     measure_element_type_accuracy,
     measure_table_structure_accuracy,
     measure_text_extraction_accuracy,
@@ -131,6 +132,30 @@ def measure_element_type_accuracy_command(
     )


+@main.command()
+@click.option(
+    "--grouping",
+    type=str,
+    required=True,
+    help="The category to group by; valid values are 'doctype' and 'connector'.",
+)
+@click.option(
+    "--data_input",
+    type=str,
+    required=True,
+    help="A DataFrame or path to the CSV/TSV file containing the data.",
+)
+@click.option(
+    "--export_dir",
+    type=str,
+    default="metrics",
+    help="Directory to save the output evaluation metrics to. Defaults to \
+        your/working/dir/metrics/",
+)
+def group_text_extraction_accuracy_command(grouping: str, data_input: str, export_dir: str):
+    return group_text_extraction_accuracy(grouping, data_input, export_dir)
+
+
 @main.command()
 @click.option("--output_dir", type=str, help="Directory to structured output.")
 @click.option("--source_dir", type=str, help="Directory to structured source.")
@@ -182,7 +207,3 @@ def measure_table_structure_accuracy_command(
     return measure_table_structure_accuracy(
         output_dir, source_dir, output_list, source_list, export_dir, visualize, cutoff
     )
-
-
-if __name__ == "__main__":
-    main()
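For reference, a sketch of exercising the new command in-process with click's test runner; the import path for the CLI module is hypothetical, since the diff does not show the file name:

```python
from click.testing import CliRunner

# Hypothetical import path -- substitute the actual module that defines `main`.
from unstructured.metrics.evaluate_cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "group-text-extraction-accuracy-command",  # click's default name for the function
        "--grouping", "doctype",
        "--data_input", "metrics/all-docs-cct.tsv",
        "--export_dir", "metrics",
    ],
)
print(result.output)
```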
@@ -4,7 +4,7 @@ import logging
 import os
 import sys
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union

 import pandas as pd
 from tqdm import tqdm
@@ -16,6 +16,7 @@ from unstructured.metrics.element_type import (
 from unstructured.metrics.table.table_eval import TableEvalProcessor
 from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
 from unstructured.metrics.utils import (
+    _count,
     _display,
     _format_grouping_output,
     _listdir_recursive,
@@ -111,32 +112,21 @@ def measure_text_extraction_accuracy(

     headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
     df = pd.DataFrame(rows, columns=headers)
-    export_filename = "all-docs-cct"

-    acc = df[["cct-accuracy"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
-    miss = df[["cct-%missing"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
-    agg_df = pd.concat((acc, miss)).reset_index()
-    agg_df.columns = agg_headers
+    acc = df[["cct-accuracy"]].agg([_mean, _stdev, _pstdev, _count]).transpose()
+    miss = df[["cct-%missing"]].agg([_mean, _stdev, _pstdev, _count]).transpose()
+    if acc.shape[1] == 0 and miss.shape[1] == 0:
+        agg_df = pd.DataFrame(columns=agg_headers)
+    else:
+        agg_df = pd.concat((acc, miss)).reset_index()
+        agg_df.columns = agg_headers
+
+    _write_to_file(export_dir, "all-docs-cct.tsv", df)
+    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_df)

     if grouping:
-        if grouping in ["doctype", "connector"]:
-            grouped_acc = (
-                df.groupby(grouping)
-                .agg({"cct-accuracy": [_mean, _stdev, "count"]})
-                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
-            )
-            grouped_miss = (
-                df.groupby(grouping)
-                .agg({"cct-%missing": [_mean, _stdev, "count"]})
-                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
-            )
-            df = _format_grouping_output(grouped_acc, grouped_miss)
-            export_filename = f"all-{grouping}-agg-cct"
-        else:
-            print("No field to group by. Returning a non-group evaluation.")
+        group_text_extraction_accuracy(grouping, df, export_dir)

-    _write_to_file(export_dir, f"{export_filename}.tsv", df)
-    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_df)
     _display(agg_df)
@@ -190,6 +180,48 @@ def measure_element_type_accuracy(
     _display(agg_df)


+def group_text_extraction_accuracy(
+    grouping: str, data_input: Union[pd.DataFrame, str], export_dir: str
+) -> None:
+    """Aggregates accuracy and missing metrics by 'doctype' or 'connector', exporting to TSV.
+
+    Args:
+        grouping (str): Grouping category ('doctype' or 'connector').
+        data_input (Union[pd.DataFrame, str]): DataFrame or path to a CSV/TSV file.
+        export_dir (str): Directory for the exported TSV file.
+    """
+    if grouping not in ("doctype", "connector"):
+        raise ValueError("Invalid grouping category. Returning a non-group evaluation.")
+    if isinstance(data_input, str):
+        if not os.path.exists(data_input):
+            raise FileNotFoundError(f"File {data_input} not found.")
+        if data_input.endswith(".csv"):
+            df = pd.read_csv(data_input)
+        elif data_input.endswith((".tsv", ".txt")):
+            df = pd.read_csv(data_input, sep="\t")
+        else:
+            raise ValueError("Please provide a .csv or .tsv file.")
+    else:
+        df = data_input
+    if df.empty or grouping not in df.columns or df[grouping].isnull().all():
+        raise SystemExit(
+            f"Data cannot be aggregated by `{grouping}`."
+            f" Check if it's empty or the column is missing/empty."
+        )
+    grouped_acc = (
+        df.groupby(grouping)
+        .agg({"cct-accuracy": [_mean, _stdev, "count"]})
+        .rename(columns={"_mean": "mean", "_stdev": "stdev"})
+    )
+    grouped_miss = (
+        df.groupby(grouping)
+        .agg({"cct-%missing": [_mean, _stdev, "count"]})
+        .rename(columns={"_mean": "mean", "_stdev": "stdev"})
+    )
+    grouped_df = _format_grouping_output(grouped_acc, grouped_miss)
+    _write_to_file(export_dir, f"all-{grouping}-agg-cct.tsv", grouped_df)
+
+
 def measure_table_structure_accuracy(
     output_dir: str,
     source_dir: str,
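At its core, the new function is a plain pandas group-and-aggregate followed by a TSV export. A self-contained sketch of that pattern, reusing the dummy data from the tests and pandas' built-in aggregators in place of the library's private `_mean`/`_stdev` helpers:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "doctype": ["pptx", "pdf", "csv"],
        "connector": ["connector1", "connector1", "connector2"],
        "cct-accuracy": [0.812, 0.994, 0.887],
        "cct-%missing": [0.001, 0.002, 0.041],
    }
)

# Group by one category and compute per-group summary statistics. Note that
# "std" is NaN for single-member groups; the library's helpers guard for that.
grouped = df.groupby("connector").agg(
    {"cct-accuracy": ["mean", "std", "count"], "cct-%missing": ["mean", "std", "count"]}
)
print(grouped)
```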
@@ -205,6 +205,13 @@ def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3) -> Union
     return round(statistics.pstdev(scores), rounding)


+def _count(scores: List[Optional[float]]) -> float:
+    """
+    Returns the row count of the list.
+    """
+    return len(scores)
+
+
 def _read_text_file(path):
     """
     Reads the contents of a text file and returns it as a string.