refactor: measure_text_edit_distance function for aggregation (#2108)

- Refactor `metrics/evaluate.py` to accept `grouping` as a parameter.
- Switch to `DataFrame` for easier analysis and aggregation.
Klaijan 2023-11-22 16:30:16 -05:00 committed by GitHub
parent d7456ab6d2
commit 2c2d5b65ca
10 changed files with 131 additions and 104 deletions
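Before the file diffs, a minimal sketch of calling the refactored helper with the new parameter; it mirrors the test added below, and the directory paths are hypothetical placeholders:

from unstructured.metrics.evaluate import measure_text_edit_distance

# Hypothetical paths; grouping="doctype" aggregates scores per document type.
measure_text_edit_distance(
    output_dir="structured-output",
    source_dir="gold-standards",
    export_dir="metrics",
    grouping="doctype",
)
# With grouping set, the table is exported as metrics/all-doctype-agg-cct.tsv,
# as the new test asserts.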

View File

@@ -20,7 +20,7 @@
### Features
* **Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation.
* **MongoDB Destination Connector** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.
* **MongoDB Destination Connector.** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.
### Fixes

View File

@@ -1,6 +1,7 @@
import os
import pathlib
import pandas as pd
import pytest
from unstructured.metrics.evaluate import (
@@ -34,3 +35,15 @@ def test_text_extraction_takes_list():
with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
lines = f.read().splitlines()
assert len(lines) == len(output_list) + 1 # includes header
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_grouping():
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_text_edit_distance(
output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
)
df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
assert len(df) == 4

View File

@@ -8,7 +8,7 @@
# Environment Variables:
# - OVERWRITE_FIXTURES: Controls whether to overwrite fixtures or not. default: "false"
set -e
set +e
SCRIPT_DIR=$(dirname "$(realpath "$0")")
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
@@ -45,10 +45,9 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
# force copy (overwrite) files from metrics-tmp (new eval metrics) to metrics (old eval metrics)
mkdir -p "$METRICS_DIR"
cp -rf "$TMP_METRICS_LATEST_RUN_DIR" "$OUTPUT_ROOT/metrics"
elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
"$SCRIPT_DIR"/clean-permissions-files.sh "$TMP_METRICS_LATEST_RUN_DIR"
diff -r "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
cat metricsdiff.txt
diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
diffstat -c metricsdiff.txt
echo
echo "There are differences from the previously checked-in structured outputs."

View File

@@ -1,2 +1,2 @@
strategy average sample_sd population_sd count
element-type-accuracy 0.814 0.108 0.077 2
metric average sample_sd population_sd count
element-type-accuracy 0.814 0.108 0.077 2

View File

@@ -1,3 +1,3 @@
filename doctype connector element-type-accuracy
IRS-form-1987.pdf pdf azure 0.89
page-with-formula.pdf pdf s3 0.737
filename doctype connector element-type-accuracy
IRS-form-1987.pdf pdf azure 0.89
page-with-formula.pdf pdf s3 0.737

View File

@@ -1,3 +1,3 @@
strategy average sample_sd population_sd count
cct-accuracy 0.792 0.253 0.245 15
cct-%missing 0.025 0.034 0.033 15
metric average sample_sd population_sd count
cct-accuracy 0.803 0.249 0.241 16
cct-%missing 0.024 0.033 0.032 16

View File

@@ -1,16 +1,17 @@
filename doctype connector cct-accuracy cct-%missing
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
IRS-form-1987.pdf pdf azure 0.783 0.135
spring-weather.html html azure 0.0 0.018
fake-text.txt txt Sharepoint 1.0 0.0
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
ideas-page.html html Sharepoint 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
example-10k.html html local 0.686 0.037
ideas-page.html html local 0.929 0.033
fake-html-cp1252.html html local 0.659 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
recalibrating-risk-report.pdf pdf s3 0.973 0.007
filename doctype connector cct-accuracy cct-%missing
fake-text.txt txt Sharepoint 1.0 0.0
ideas-page.html html Sharepoint 0.929 0.033
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
IRS-form-1987.pdf pdf azure 0.783 0.135
spring-weather.html html azure 0.0 0.018
example-10k.html html local 0.686 0.037
fake-html-cp1252.html html local 0.659 0.0
ideas-page.html html local 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
page-with-formula.pdf pdf s3 0.971 0.021
recalibrating-risk-report.pdf pdf s3 0.973 0.007

View File

@@ -36,6 +36,7 @@ def main():
help="Directory to save the output evaluation metrics to. Default to \
your/working/dir/metrics/",
)
@click.option("--grouping", type=str, help="Input field for aggregration, or leave blank if none.")
@click.option(
"--weights",
type=(int, int, int),
@@ -50,10 +51,11 @@ def measure_text_edit_distance_command(
output_list: Optional[List[str]],
source_list: Optional[List[str]],
export_dir: str,
grouping: Optional[str],
weights: Tuple[int, int, int],
):
return measure_text_edit_distance(
output_dir, source_dir, output_list, source_list, export_dir, weights
output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
)
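A hedged sketch of exercising the new flag through click's test runner. The module path for the command and the spellings of the directory flags are assumptions, since only --grouping appears in this hunk:

from click.testing import CliRunner

# Assumed import path; the command object is the function decorated above.
from unstructured.ingest.evaluate import measure_text_edit_distance_command

runner = CliRunner()
result = runner.invoke(
    measure_text_edit_distance_command,
    # Directory flag spellings are assumptions; --grouping is from the diff.
    ["--output_dir", "out", "--source_dir", "gold", "--grouping", "doctype"],
)
assert result.exit_code == 0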

View File

@@ -7,6 +7,11 @@ def get_element_type_frequency(
) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]:
"""
Calculate the frequency of Element Types from a list of elements.
Args:
elements (str): String-formatted json of all elements (as a result of elements_to_json).
Returns:
Element type and its frequency in dictionary format.
"""
frequency: Dict = {}
if len(elements) == 0:
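A usage sketch matching the docstring just added. The payload below is made up and simplified; a real string from elements_to_json carries more fields, and the key shape follows the function's type hints:

import json

from unstructured.metrics.element_type import get_element_type_frequency

elements_json = json.dumps(
    [
        {"type": "Title", "text": "Quarterly Report"},
        {"type": "NarrativeText", "text": "Revenue grew."},
        {"type": "NarrativeText", "text": "Costs fell."},
    ]
)
frequency = get_element_type_frequency(elements_json)
# Per the type hints, keys are (element type, optional depth) tuples,
# e.g. {("Title", None): 1, ("NarrativeText", None): 2}.
print(frequency)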

View File

@@ -1,13 +1,13 @@
#! /usr/bin/env python3
import csv
import logging
import os
import statistics
import sys
from typing import Any, List, Optional, Tuple
from typing import List, Optional, Tuple, Union
import click
import pandas as pd
from unstructured.metrics.element_type import (
calculate_element_type_percent_match,
@@ -29,7 +29,7 @@ if "ingest_log_handler" not in [h.name for h in logger.handlers]:
logger.setLevel(logging.DEBUG)
agg_headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]
def measure_text_edit_distance(
@@ -38,6 +38,7 @@ def measure_text_edit_distance(
output_list: Optional[List[str]] = None,
source_list: Optional[List[str]] = None,
export_dir: str = "metrics",
grouping: Optional[str] = None,
weights: Tuple[int, int, int] = (2, 1, 1),
) -> None:
"""
@@ -58,50 +59,57 @@ def measure_text_edit_distance(
sys.exit(0)
rows = []
accuracy_scores: List[float] = []
percent_missing_scores: List[float] = []
# assumption: output file name convention is name-of-file.doc.json
for doc in output_list: # type: ignore
fn = (doc.split("/")[-1]).split(".json")[0]
doctype = fn.rsplit(".", 1)[-1]
fn_txt = fn + ".txt"
filename = (doc.split("/")[-1]).split(".json")[0]
doctype = filename.rsplit(".", 1)[-1]
fn_txt = filename + ".txt"
connector = doc.split("/")[0]
# not all odetta cct files follow the same naming convention;
# some exclude the original filetype from the name
if fn_txt not in source_list:
fn = filename.rsplit(".", 1)[0]
fn_txt = fn + ".txt"
if fn_txt in source_list: # type: ignore
output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
source_cct = _read_text(os.path.join(source_dir, fn_txt))
accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
rows.append([fn, doctype, connector, accuracy, percent_missing])
accuracy_scores.append(accuracy)
percent_missing_scores.append(percent_missing)
rows.append([filename, doctype, connector, accuracy, percent_missing])
headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
_write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
df = pd.DataFrame(rows, columns=headers)
export_filename = "all-docs-cct"
agg_rows = []
agg_rows.append(
[
"cct-accuracy",
_mean(accuracy_scores),
_stdev(accuracy_scores),
_pstdev(accuracy_scores),
len(accuracy_scores),
],
)
agg_rows.append(
[
"cct-%missing",
_mean(percent_missing_scores),
_stdev(percent_missing_scores),
_pstdev(percent_missing_scores),
len(percent_missing_scores),
],
)
_write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_rows, agg_headers)
_display(agg_rows, agg_headers)
acc = df[["cct-accuracy"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
miss = df[["cct-%missing"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
agg_df = pd.concat((acc, miss)).reset_index()
agg_df.columns = agg_headers
if grouping:
if grouping in ["doctype", "connector"]:
grouped_acc = (
df.groupby(grouping)
.agg({"cct-accuracy": [_mean, _stdev, "count"]})
.rename(columns={"_mean": "mean", "_stdev": "stdev"})
)
grouped_miss = (
df.groupby(grouping)
.agg({"cct-%missing": [_mean, _stdev, "count"]})
.rename(columns={"_mean": "mean", "_stdev": "stdev"})
)
df = _format_grouping_output(grouped_acc, grouped_miss)
export_filename = f"all-{grouping}-agg-cct"
else:
print("No field to group by. Returning a non-group evaluation.")
_write_to_file(export_dir, f"{export_filename}.tsv", df)
_write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_df)
_display(agg_df)
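The grouped branch leans on a standard pandas pattern; a self-contained toy version (scores made up) shows why the renames are needed: each aggregation function's name becomes a column label.

import pandas as pd

df = pd.DataFrame(
    {
        "doctype": ["pdf", "pdf", "html"],
        "cct-accuracy": [0.98, 0.78, 0.69],
    }
)
# The function names ("mean", "std", "count") become second-level column
# labels, mirroring how _mean/_stdev surface in the code above.
grouped = df.groupby("doctype").agg({"cct-accuracy": ["mean", "std", "count"]})
print(grouped)
# Roughly:
#         cct-accuracy
#                 mean       std count
# doctype
# html            0.69       NaN     1
# pdf             0.88  0.141421     2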
def measure_element_type_accuracy(
@@ -124,40 +132,31 @@ def measure_element_type_accuracy(
if not source_list:
source_list = _listdir_recursive(source_dir)
if not output_list:
print("No output files to calculate to element type for, exiting")
sys.exit(0)
rows = []
accuracy_scores: List[float] = []
for doc in output_list: # type: ignore
fn = (doc.split("/")[-1]).split(".json")[0]
doctype = fn.rsplit(".", 1)[-1]
fn_json = fn + ".json"
filename = (doc.split("/")[-1]).split(".json")[0]
doctype = filename.rsplit(".", 1)[-1]
fn_json = filename + ".json"
connector = doc.split("/")[0]
if fn_json in source_list: # type: ignore
output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json)))
accuracy = round(calculate_element_type_percent_match(output, source), 3)
rows.append([fn, doctype, connector, accuracy])
accuracy_scores.append(accuracy)
rows.append([filename, doctype, connector, accuracy])
headers = ["filename", "doctype", "connector", "element-type-accuracy"]
_write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)
df = pd.DataFrame(rows, columns=headers)
if df.empty:
agg_df = pd.DataFrame(["element-type-accuracy", None, None, None, 0]).transpose()
else:
agg_df = df.agg({"element-type-accuracy": [_mean, _stdev, _pstdev, "count"]}).transpose()
agg_df = agg_df.reset_index()
agg_df.columns = agg_headers
agg_rows = []
agg_rows.append(
[
"element-type-accuracy",
_mean(accuracy_scores),
_stdev(accuracy_scores),
_pstdev(accuracy_scores),
len(accuracy_scores),
],
)
_write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, agg_headers)
_display(agg_rows, agg_headers)
_write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df)
_write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df)
_display(agg_df)
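The empty-frame fallback above builds its placeholder row via transpose; a minimal standalone illustration with the same headers:

import pandas as pd

agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]
# A flat list becomes a single column; transpose() flips it into one row.
agg_df = pd.DataFrame(["element-type-accuracy", None, None, None, 0]).transpose()
agg_df.columns = agg_headers
print(agg_df)
#                   metric average sample_sd population_sd count
# 0  element-type-accuracy    None      None          None     0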
def _listdir_recursive(dir: str):
@@ -173,13 +172,20 @@ def _listdir_recursive(dir: str):
return listdir
def _display(rows, headers):
def _format_grouping_output(*df):
return pd.concat(df, axis=1).reset_index()
def _display(df):
if len(df) == 0:
return
headers = df.columns.tolist()
col_widths = [
max(len(headers[i]), max(len(str(row[i])) for row in rows)) for i in range(len(headers))
max(len(header), max(len(str(item)) for item in df[header])) for header in headers
]
click.echo(" ".join(headers[i].ljust(col_widths[i]) for i in range(len(headers))))
click.echo(" ".join(header.ljust(col_widths[i]) for i, header in enumerate(headers)))
click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1))
for row in rows:
for _, row in df.iterrows():
formatted_row = []
for item in row:
if isinstance(item, float):
@@ -191,31 +197,31 @@ def _display(rows, headers):
)
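The width computation in _display is plain fixed-width padding; a toy version of the same idea with made-up rows:

headers = ["metric", "average"]
rows = [["cct-accuracy", 0.803], ["cct-%missing", 0.024]]

# Each column is as wide as its widest cell, header included.
col_widths = [
    max(len(h), max(len(str(row[i])) for row in rows))
    for i, h in enumerate(headers)
]
print("  ".join(h.ljust(col_widths[i]) for i, h in enumerate(headers)))
for row in rows:
    print("  ".join(str(v).ljust(col_widths[i]) for i, v in enumerate(row)))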
def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any], mode: str = "w"):
def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"):
if mode not in ["w", "a"]:
raise ValueError("Mode not supported. Mode must be one of [w, a].")
if dir and not os.path.exists(dir):
os.makedirs(dir)
with open(os.path.join(os.path.join(dir, filename)), mode, newline="") as tsv:
writer = csv.writer(tsv, delimiter="\t")
if mode == "w":
writer.writerow(headers)
writer.writerows(rows)
if "count" in df.columns:
df["count"] = df["count"].astype(int)
if "filename" in df.columns and "connector" in df.columns:
df.sort_values(by=["connector", "filename"], inplace=True)
df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w"))
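The header=(mode == "w") argument is what keeps appended runs from writing a second header row; a small illustration with made-up data:

import pandas as pd

df = pd.DataFrame({"filename": ["a.pdf"], "cct-accuracy": [0.9]})
df.to_csv("scores.tsv", sep="\t", mode="w", index=False, header=True)
# A later run can extend the same file without repeating the header:
df.to_csv("scores.tsv", sep="\t", mode="a", index=False, header=False)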
def _mean(scores: List[float], rounding: Optional[int] = 3):
if len(scores) < 1:
def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3):
if len(scores) == 0:
return None
elif len(scores) == 1:
mean = scores[0]
else:
mean = statistics.mean(scores)
mean = statistics.mean(scores)
if not rounding:
return mean
return round(mean, rounding)
def _stdev(scores: List[float], rounding: Optional[int] = 3):
def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
# Filter out None values
scores = [score for score in scores if score is not None]
# Proceed only if there are more than one value
if len(scores) <= 1:
return None
if not rounding:
@@ -223,7 +229,8 @@ def _stdev(scores: List[float], rounding: Optional[int] = 3):
return round(statistics.stdev(scores), rounding)
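The None-filtering and the <= 1 guard exist because statistics.stdev cannot handle None values and raises StatisticsError on fewer than two points; a quick check:

import statistics

scores = [0.9, None, 0.7, None]
filtered = [s for s in scores if s is not None]  # [0.9, 0.7]
print(round(statistics.stdev(filtered), 3))      # 0.141

# With zero or one value remaining, statistics.stdev raises StatisticsError,
# which is why the helpers return None instead.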
def _pstdev(scores: List[float], rounding: Optional[int] = 3):
def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
scores = [score for score in scores if score is not None]
if len(scores) <= 1:
return None
if not rounding: