Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-12-24 13:44:05 +00:00)
refactor: measure_text_edit_distance function for aggregation (#2108)
- Refactor `metrics/evaluate.py` to accept `grouping` as a parameter.
- Switch to `DataFrame` for easier analysis and aggregation.
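
A minimal sketch of the new grouping option, mirroring the test added in this commit; the directory paths below are placeholders, not paths from the diff:

# Sketch only: calls measure_text_edit_distance with the new grouping argument.
from unstructured.metrics.evaluate import measure_text_edit_distance

measure_text_edit_distance(
    output_dir="structured-output",   # partitioned .json output files (placeholder path)
    source_dir="gold-standard-cct",   # gold-standard .txt sources (placeholder path)
    export_dir="metrics",             # where the .tsv results are written
    grouping="doctype",               # aggregate per doctype; "connector" is also accepted
)
# Per-document scores land in all-docs-cct.tsv, overall aggregates in
# aggregate-scores-cct.tsv, and grouped aggregates in all-doctype-agg-cct.tsv.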
This commit is contained in:
parent
d7456ab6d2
commit
2c2d5b65ca
@@ -20,7 +20,7 @@

### Features

* **Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation.
* **MongoDB Destination Connector** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.
* **MongoDB Destination Connector.** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.

### Fixes
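
A quick illustration of the ad-hoc metadata bullet quoted in the changelog hunk above. This is a sketch: the element constructor and the `to_dict` round-trip are assumed from the library's public API, not shown in this diff.

from unstructured.documents.elements import Text

element = Text("hello world")
element.metadata.coefficient = 0.58                 # ad-hoc field with a user-chosen name
print(element.metadata.to_dict()["coefficient"])    # 0.58 survives the dict/JSON round-trip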

@@ -1,6 +1,7 @@
import os
import pathlib

import pandas as pd
import pytest

from unstructured.metrics.evaluate import (

@@ -34,3 +35,15 @@ def test_text_extraction_takes_list():
    with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
        lines = f.read().splitlines()
    assert len(lines) == len(output_list) + 1  # includes header


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_grouping():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_edit_distance(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
    )
    df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert len(df) == 4

@@ -8,7 +8,7 @@
# Environment Variables:
# - OVERWRITE_FIXTURES: Controls whether to overwrite fixtures or not. default: "false"

set -e
set +e

SCRIPT_DIR=$(dirname "$(realpath "$0")")
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

@@ -45,10 +45,9 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
    # force copy (overwrite) files from metrics-tmp (new eval metrics) to metrics (old eval metrics)
    mkdir -p "$METRICS_DIR"
    cp -rf "$TMP_METRICS_LATEST_RUN_DIR" "$OUTPUT_ROOT/metrics"
elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
    "$SCRIPT_DIR"/clean-permissions-files.sh "$TMP_METRICS_LATEST_RUN_DIR"
    diff -r "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
    cat metricsdiff.txt
    diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
    diffstat -c metricsdiff.txt
    echo
    echo "There are differences from the previously checked-in structured outputs."

@@ -1,2 +1,2 @@
strategy average sample_sd population_sd count
element-type-accuracy 0.814 0.108 0.077 2
metric average sample_sd population_sd count
element-type-accuracy 0.814 0.108 0.077 2

@@ -1,3 +1,3 @@
filename doctype connector element-type-accuracy
IRS-form-1987.pdf pdf azure 0.89
page-with-formula.pdf pdf s3 0.737
filename doctype connector element-type-accuracy
IRS-form-1987.pdf pdf azure 0.89
page-with-formula.pdf pdf s3 0.737

@@ -1,3 +1,3 @@
strategy average sample_sd population_sd count
cct-accuracy 0.792 0.253 0.245 15
cct-%missing 0.025 0.034 0.033 15
metric average sample_sd population_sd count
cct-accuracy 0.803 0.249 0.241 16
cct-%missing 0.024 0.033 0.032 16

@@ -1,16 +1,17 @@
filename doctype connector cct-accuracy cct-%missing
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
IRS-form-1987.pdf pdf azure 0.783 0.135
spring-weather.html html azure 0.0 0.018
fake-text.txt txt Sharepoint 1.0 0.0
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
ideas-page.html html Sharepoint 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
example-10k.html html local 0.686 0.037
ideas-page.html html local 0.929 0.033
fake-html-cp1252.html html local 0.659 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
recalibrating-risk-report.pdf pdf s3 0.973 0.007
filename doctype connector cct-accuracy cct-%missing
fake-text.txt txt Sharepoint 1.0 0.0
ideas-page.html html Sharepoint 0.929 0.033
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
IRS-form-1987.pdf pdf azure 0.783 0.135
spring-weather.html html azure 0.0 0.018
example-10k.html html local 0.686 0.037
fake-html-cp1252.html html local 0.659 0.0
ideas-page.html html local 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
page-with-formula.pdf pdf s3 0.971 0.021
recalibrating-risk-report.pdf pdf s3 0.973 0.007

@@ -36,6 +36,7 @@ def main():
    help="Directory to save the output evaluation metrics to. Default to \
        your/working/dir/metrics/",
)
@click.option("--grouping", type=str, help="Input field for aggregration, or leave blank if none.")
@click.option(
    "--weights",
    type=(int, int, int),

@@ -50,10 +51,11 @@ def measure_text_edit_distance_command(
    output_list: Optional[List[str]],
    source_list: Optional[List[str]],
    export_dir: str,
    grouping: Optional[str],
    weights: Tuple[int, int, int],
):
    return measure_text_edit_distance(
        output_dir, source_dir, output_list, source_list, export_dir, weights
        output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
    )

@@ -7,6 +7,11 @@ def get_element_type_frequency(
) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]:
    """
    Calculate the frequency of Element Types from a list of elements.

    Args:
        elements (str): String-formatted json of all elements (as a result of elements_to_json).
    Returns:
        Element type and its frequency in dictionary format.
    """
    frequency: Dict = {}
    if len(elements) == 0:
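
Per the docstring above, the helper takes string-formatted JSON (the output of `elements_to_json`) and returns a frequency mapping. A hedged sketch with an assumed file path and an illustrative, made-up result:

from unstructured.metrics.element_type import get_element_type_frequency

with open("structured-output/example.pdf.json") as f:   # hypothetical path
    elements_json = f.read()

freq = get_element_type_frequency(elements_json)
# Keys are (element type, category depth) pairs per the return type above,
# e.g. {("Title", None): 3, ("NarrativeText", None): 12}
print(freq)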

@@ -1,13 +1,13 @@
#! /usr/bin/env python3

import csv
import logging
import os
import statistics
import sys
from typing import Any, List, Optional, Tuple
from typing import List, Optional, Tuple, Union

import click
import pandas as pd

from unstructured.metrics.element_type import (
    calculate_element_type_percent_match,

@@ -29,7 +29,7 @@ if "ingest_log_handler" not in [h.name for h in logger.handlers]:
logger.setLevel(logging.DEBUG)


agg_headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]


def measure_text_edit_distance(

@@ -38,6 +38,7 @@ def measure_text_edit_distance(
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
    export_dir: str = "metrics",
    grouping: Optional[str] = None,
    weights: Tuple[int, int, int] = (2, 1, 1),
) -> None:
    """

@@ -58,50 +59,57 @@ def measure_text_edit_distance(
        sys.exit(0)

    rows = []
    accuracy_scores: List[float] = []
    percent_missing_scores: List[float] = []

    # assumption: output file name convention is name-of-file.doc.json
    for doc in output_list:  # type: ignore
        fn = (doc.split("/")[-1]).split(".json")[0]
        doctype = fn.rsplit(".", 1)[-1]
        fn_txt = fn + ".txt"
        filename = (doc.split("/")[-1]).split(".json")[0]
        doctype = filename.rsplit(".", 1)[-1]
        fn_txt = filename + ".txt"
        connector = doc.split("/")[0]

        # not all odetta cct files follow the same naming convention;
        # some exclude the original filetype from the name
        if fn_txt not in source_list:
            fn = filename.rsplit(".", 1)[0]
            fn_txt = fn + ".txt"

        if fn_txt in source_list:  # type: ignore
            output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
            source_cct = _read_text(os.path.join(source_dir, fn_txt))
            accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
            percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)

            rows.append([fn, doctype, connector, accuracy, percent_missing])
            accuracy_scores.append(accuracy)
            percent_missing_scores.append(percent_missing)
            rows.append([filename, doctype, connector, accuracy, percent_missing])

    headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
    _write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
    df = pd.DataFrame(rows, columns=headers)
    export_filename = "all-docs-cct"

    agg_rows = []
    agg_rows.append(
        [
            "cct-accuracy",
            _mean(accuracy_scores),
            _stdev(accuracy_scores),
            _pstdev(accuracy_scores),
            len(accuracy_scores),
        ],
    )
    agg_rows.append(
        [
            "cct-%missing",
            _mean(percent_missing_scores),
            _stdev(percent_missing_scores),
            _pstdev(percent_missing_scores),
            len(percent_missing_scores),
        ],
    )
    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_rows, agg_headers)
    _display(agg_rows, agg_headers)
    acc = df[["cct-accuracy"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
    miss = df[["cct-%missing"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
    agg_df = pd.concat((acc, miss)).reset_index()
    agg_df.columns = agg_headers

    if grouping:
        if grouping in ["doctype", "connector"]:
            grouped_acc = (
                df.groupby(grouping)
                .agg({"cct-accuracy": [_mean, _stdev, "count"]})
                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
            )
            grouped_miss = (
                df.groupby(grouping)
                .agg({"cct-%missing": [_mean, _stdev, "count"]})
                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
            )
            df = _format_grouping_output(grouped_acc, grouped_miss)
            export_filename = f"all-{grouping}-agg-cct"
        else:
            print("No field to group by. Returning a non-group evaluation.")

    _write_to_file(export_dir, f"{export_filename}.tsv", df)
    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_df)
    _display(agg_df)


def measure_element_type_accuracy(
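
The grouped branch above boils down to a pandas groupby over the per-document score frame. A standalone sketch of that aggregation, using pandas' built-in "mean"/"std"/"count" in place of the module's `_mean`/`_stdev` helpers, with rows taken from the CCT fixture earlier in this diff:

import pandas as pd

df = pd.DataFrame(
    [
        ["IRS-form-1987.pdf", "pdf", "azure", 0.783, 0.135],
        ["spring-weather.html", "html", "azure", 0.0, 0.018],
        ["fake-text.txt", "txt", "Sharepoint", 1.0, 0.0],
    ],
    columns=["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"],
)

# One aggregate row per doctype, analogous to all-doctype-agg-cct.tsv.
grouped = df.groupby("doctype").agg({"cct-accuracy": ["mean", "std", "count"]})
print(grouped)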

@@ -124,40 +132,31 @@ def measure_element_type_accuracy(
    if not source_list:
        source_list = _listdir_recursive(source_dir)

    if not output_list:
        print("No output files to calculate to element type for, exiting")
        sys.exit(0)

    rows = []
    accuracy_scores: List[float] = []

    for doc in output_list:  # type: ignore
        fn = (doc.split("/")[-1]).split(".json")[0]
        doctype = fn.rsplit(".", 1)[-1]
        fn_json = fn + ".json"
        filename = (doc.split("/")[-1]).split(".json")[0]
        doctype = filename.rsplit(".", 1)[-1]
        fn_json = filename + ".json"
        connector = doc.split("/")[0]
        if fn_json in source_list:  # type: ignore
            output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
            source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json)))
            accuracy = round(calculate_element_type_percent_match(output, source), 3)
            rows.append([fn, doctype, connector, accuracy])
            accuracy_scores.append(accuracy)
            rows.append([filename, doctype, connector, accuracy])

    headers = ["filename", "doctype", "connector", "element-type-accuracy"]
    _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)
    df = pd.DataFrame(rows, columns=headers)
    if df.empty:
        agg_df = pd.DataFrame(["element-type-accuracy", None, None, None, 0]).transpose()
    else:
        agg_df = df.agg({"element-type-accuracy": [_mean, _stdev, _pstdev, "count"]}).transpose()
    agg_df = agg_df.reset_index()
    agg_df.columns = agg_headers

    agg_rows = []
    agg_rows.append(
        [
            "element-type-accuracy",
            _mean(accuracy_scores),
            _stdev(accuracy_scores),
            _pstdev(accuracy_scores),
            len(accuracy_scores),
        ],
    )
    _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, agg_headers)
    _display(agg_rows, agg_headers)
    _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df)
    _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df)
    _display(agg_df)


def _listdir_recursive(dir: str):

@@ -173,13 +172,20 @@ def _listdir_recursive(dir: str):
    return listdir


def _display(rows, headers):
def _format_grouping_output(*df):
    return pd.concat(df, axis=1).reset_index()


def _display(df):
    if len(df) == 0:
        return
    headers = df.columns.tolist()
    col_widths = [
        max(len(headers[i]), max(len(str(row[i])) for row in rows)) for i in range(len(headers))
        max(len(header), max(len(str(item)) for item in df[header])) for header in headers
    ]
    click.echo(" ".join(headers[i].ljust(col_widths[i]) for i in range(len(headers))))
    click.echo(" ".join(header.ljust(col_widths[i]) for i, header in enumerate(headers)))
    click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1))
    for row in rows:
    for _, row in df.iterrows():
        formatted_row = []
        for item in row:
            if isinstance(item, float):

@@ -191,31 +197,31 @@ def _display(rows, headers):
    )


def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any], mode: str = "w"):
def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"):
    if mode not in ["w", "a"]:
        raise ValueError("Mode not supported. Mode must be one of [w, a].")
    if dir and not os.path.exists(dir):
        os.makedirs(dir)
    with open(os.path.join(os.path.join(dir, filename)), mode, newline="") as tsv:
        writer = csv.writer(tsv, delimiter="\t")
        if mode == "w":
            writer.writerow(headers)
        writer.writerows(rows)
    if "count" in df.columns:
        df["count"] = df["count"].astype(int)
    if "filename" in df.columns and "connector" in df.columns:
        df.sort_values(by=["connector", "filename"], inplace=True)
    df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w"))


def _mean(scores: List[float], rounding: Optional[int] = 3):
    if len(scores) < 1:
def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3):
    if len(scores) == 0:
        return None
    elif len(scores) == 1:
        mean = scores[0]
    else:
        mean = statistics.mean(scores)
    mean = statistics.mean(scores)
    if not rounding:
        return mean
    return round(mean, rounding)


def _stdev(scores: List[float], rounding: Optional[int] = 3):
def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
    # Filter out None values
    scores = [score for score in scores if score is not None]
    # Proceed only if there are more than one value
    if len(scores) <= 1:
        return None
    if not rounding:

@@ -223,7 +229,8 @@ def _stdev(scores: List[float], rounding: Optional[int] = 3):
    return round(statistics.stdev(scores), rounding)


def _pstdev(scores: List[float], rounding: Optional[int] = 3):
def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
    scores = [score for score in scores if score is not None]
    if len(scores) <= 1:
        return None
    if not rounding:
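
Since every writer now funnels through `_write_to_file` and `df.to_csv(..., sep="\t")`, the exported files can be inspected directly with pandas. A small sketch, assuming the default `export_dir` of `metrics`:

import pandas as pd

per_doc = pd.read_csv("metrics/all-docs-cct.tsv", sep="\t")            # one row per document
aggregate = pd.read_csv("metrics/aggregate-scores-cct.tsv", sep="\t")
print(aggregate)   # columns: metric, average, sample_sd, population_sd, count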