Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-12-24 13:44:05 +00:00)
refactor: measure_text_edit_distance function for aggregation (#2108)
- Refactor `metrics/evaluate.py` to accept `grouping` as a parameter.
- Switch to `DataFrame` for easier analysis and aggregation.
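
A minimal sketch of the new grouping option, mirroring the test added in this commit; the directory paths below are placeholders, not paths from the diff:

# Sketch only: calls measure_text_edit_distance with the new grouping argument.
from unstructured.metrics.evaluate import measure_text_edit_distance

measure_text_edit_distance(
    output_dir="structured-output",   # partitioned .json output files (placeholder path)
    source_dir="gold-standard-cct",   # gold-standard .txt sources (placeholder path)
    export_dir="metrics",             # where the .tsv results are written
    grouping="doctype",               # aggregate per doctype; "connector" is also accepted
)
# Per-document scores land in all-docs-cct.tsv, overall aggregates in
# aggregate-scores-cct.tsv, and grouped aggregates in all-doctype-agg-cct.tsv.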
This commit is contained in:
parent
d7456ab6d2
commit
2c2d5b65ca
@@ -20,7 +20,7 @@

### Features

* **Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation.
* **MongoDB Destination Connector** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.
* **MongoDB Destination Connector.** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.

### Fixes
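
A quick illustration of the ad-hoc metadata bullet quoted in the changelog hunk above. This is a sketch: the element constructor and the `to_dict` round-trip are assumed from the library's public API, not shown in this diff.

from unstructured.documents.elements import Text

element = Text("hello world")
element.metadata.coefficient = 0.58                 # ad-hoc field with a user-chosen name
print(element.metadata.to_dict()["coefficient"])    # 0.58 survives the dict/JSON round-trip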

@@ -1,6 +1,7 @@
import os
import pathlib

import pandas as pd
import pytest

from unstructured.metrics.evaluate import (

@@ -34,3 +35,15 @@ def test_text_extraction_takes_list():
    with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
        lines = f.read().splitlines()
    assert len(lines) == len(output_list) + 1  # includes header


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_grouping():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_edit_distance(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
    )
    df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert len(df) == 4

@@ -8,7 +8,7 @@
# Environment Variables:
# - OVERWRITE_FIXTURES: Controls whether to overwrite fixtures or not. default: "false"

set -e
set +e

SCRIPT_DIR=$(dirname "$(realpath "$0")")
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

@@ -45,10 +45,9 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
    # force copy (overwrite) files from metrics-tmp (new eval metrics) to metrics (old eval metrics)
    mkdir -p "$METRICS_DIR"
    cp -rf "$TMP_METRICS_LATEST_RUN_DIR" "$OUTPUT_ROOT/metrics"
elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
    "$SCRIPT_DIR"/clean-permissions-files.sh "$TMP_METRICS_LATEST_RUN_DIR"
    diff -r "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
    cat metricsdiff.txt
    diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
    diffstat -c metricsdiff.txt
    echo
    echo "There are differences from the previously checked-in structured outputs."

@@ -1,2 +1,2 @@
strategy average sample_sd population_sd count
element-type-accuracy 0.814 0.108 0.077 2
metric average sample_sd population_sd count
element-type-accuracy 0.814 0.108 0.077 2

@@ -1,3 +1,3 @@
filename doctype connector element-type-accuracy
IRS-form-1987.pdf pdf azure 0.89
page-with-formula.pdf pdf s3 0.737
filename doctype connector element-type-accuracy
IRS-form-1987.pdf pdf azure 0.89
page-with-formula.pdf pdf s3 0.737

@@ -1,3 +1,3 @@
strategy average sample_sd population_sd count
cct-accuracy 0.792 0.253 0.245 15
cct-%missing 0.025 0.034 0.033 15
metric average sample_sd population_sd count
cct-accuracy 0.803 0.249 0.241 16
cct-%missing 0.024 0.033 0.032 16

@@ -1,16 +1,17 @@
filename doctype connector cct-accuracy cct-%missing
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
IRS-form-1987.pdf pdf azure 0.783 0.135
spring-weather.html html azure 0.0 0.018
fake-text.txt txt Sharepoint 1.0 0.0
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
ideas-page.html html Sharepoint 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
example-10k.html html local 0.686 0.037
ideas-page.html html local 0.929 0.033
fake-html-cp1252.html html local 0.659 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
recalibrating-risk-report.pdf pdf s3 0.973 0.007
filename doctype connector cct-accuracy cct-%missing
fake-text.txt txt Sharepoint 1.0 0.0
ideas-page.html html Sharepoint 0.929 0.033
stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
IRS-form-1987.pdf pdf azure 0.783 0.135
spring-weather.html html azure 0.0 0.018
example-10k.html html local 0.686 0.037
fake-html-cp1252.html html local 0.659 0.0
ideas-page.html html local 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
page-with-formula.pdf pdf s3 0.971 0.021
recalibrating-risk-report.pdf pdf s3 0.973 0.007

@@ -36,6 +36,7 @@ def main():
    help="Directory to save the output evaluation metrics to. Default to \
        your/working/dir/metrics/",
)
@click.option("--grouping", type=str, help="Input field for aggregration, or leave blank if none.")
@click.option(
    "--weights",
    type=(int, int, int),

@@ -50,10 +51,11 @@ def measure_text_edit_distance_command(
    output_list: Optional[List[str]],
    source_list: Optional[List[str]],
    export_dir: str,
    grouping: Optional[str],
    weights: Tuple[int, int, int],
):
    return measure_text_edit_distance(
        output_dir, source_dir, output_list, source_list, export_dir, weights
        output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
    )

@@ -7,6 +7,11 @@ def get_element_type_frequency(
) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]:
    """
    Calculate the frequency of Element Types from a list of elements.

    Args:
        elements (str): String-formatted json of all elements (as a result of elements_to_json).
    Returns:
        Element type and its frequency in dictionary format.
    """
    frequency: Dict = {}
    if len(elements) == 0:
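
Per the docstring above, the helper takes string-formatted JSON (the output of `elements_to_json`) and returns a frequency mapping. A hedged sketch with an assumed file path and an illustrative, made-up result:

from unstructured.metrics.element_type import get_element_type_frequency

with open("structured-output/example.pdf.json") as f:   # hypothetical path
    elements_json = f.read()

freq = get_element_type_frequency(elements_json)
# Keys are (element type, category depth) pairs per the return type above,
# e.g. {("Title", None): 3, ("NarrativeText", None): 12}
print(freq)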

@@ -1,13 +1,13 @@
#! /usr/bin/env python3

import csv
import logging
import os
import statistics
import sys
from typing import Any, List, Optional, Tuple
from typing import List, Optional, Tuple, Union

import click
import pandas as pd

from unstructured.metrics.element_type import (
    calculate_element_type_percent_match,

@@ -29,7 +29,7 @@ if "ingest_log_handler" not in [h.name for h in logger.handlers]:
logger.setLevel(logging.DEBUG)


agg_headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]


def measure_text_edit_distance(

@@ -38,6 +38,7 @@ def measure_text_edit_distance(
    output_list: Optional[List[str]] = None,
    source_list: Optional[List[str]] = None,
    export_dir: str = "metrics",
    grouping: Optional[str] = None,
    weights: Tuple[int, int, int] = (2, 1, 1),
) -> None:
    """

@@ -58,50 +59,57 @@ def measure_text_edit_distance(
        sys.exit(0)

    rows = []
    accuracy_scores: List[float] = []
    percent_missing_scores: List[float] = []

    # assumption: output file name convention is name-of-file.doc.json
    for doc in output_list:  # type: ignore
        fn = (doc.split("/")[-1]).split(".json")[0]
        doctype = fn.rsplit(".", 1)[-1]
        fn_txt = fn + ".txt"
        filename = (doc.split("/")[-1]).split(".json")[0]
        doctype = filename.rsplit(".", 1)[-1]
        fn_txt = filename + ".txt"
        connector = doc.split("/")[0]

        # not all odetta cct files follow the same naming convention;
        # some exclude the original filetype from the name
        if fn_txt not in source_list:
            fn = filename.rsplit(".", 1)[0]
            fn_txt = fn + ".txt"

        if fn_txt in source_list:  # type: ignore
            output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
            source_cct = _read_text(os.path.join(source_dir, fn_txt))
            accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
            percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)

            rows.append([fn, doctype, connector, accuracy, percent_missing])
            accuracy_scores.append(accuracy)
            percent_missing_scores.append(percent_missing)
            rows.append([filename, doctype, connector, accuracy, percent_missing])

    headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
    _write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
    df = pd.DataFrame(rows, columns=headers)
    export_filename = "all-docs-cct"

    agg_rows = []
    agg_rows.append(
        [
            "cct-accuracy",
            _mean(accuracy_scores),
            _stdev(accuracy_scores),
            _pstdev(accuracy_scores),
            len(accuracy_scores),
        ],
    )
    agg_rows.append(
        [
            "cct-%missing",
            _mean(percent_missing_scores),
            _stdev(percent_missing_scores),
            _pstdev(percent_missing_scores),
            len(percent_missing_scores),
        ],
    )
    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_rows, agg_headers)
    _display(agg_rows, agg_headers)
    acc = df[["cct-accuracy"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
    miss = df[["cct-%missing"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
    agg_df = pd.concat((acc, miss)).reset_index()
    agg_df.columns = agg_headers

    if grouping:
        if grouping in ["doctype", "connector"]:
            grouped_acc = (
                df.groupby(grouping)
                .agg({"cct-accuracy": [_mean, _stdev, "count"]})
                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
            )
            grouped_miss = (
                df.groupby(grouping)
                .agg({"cct-%missing": [_mean, _stdev, "count"]})
                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
            )
            df = _format_grouping_output(grouped_acc, grouped_miss)
            export_filename = f"all-{grouping}-agg-cct"
        else:
            print("No field to group by. Returning a non-group evaluation.")

    _write_to_file(export_dir, f"{export_filename}.tsv", df)
    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_df)
    _display(agg_df)


def measure_element_type_accuracy(
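
The grouped branch above boils down to a pandas groupby over the per-document score frame. A standalone sketch of that aggregation, using pandas' built-in "mean"/"std"/"count" in place of the module's `_mean`/`_stdev` helpers, with rows taken from the CCT fixture earlier in this diff:

import pandas as pd

df = pd.DataFrame(
    [
        ["IRS-form-1987.pdf", "pdf", "azure", 0.783, 0.135],
        ["spring-weather.html", "html", "azure", 0.0, 0.018],
        ["fake-text.txt", "txt", "Sharepoint", 1.0, 0.0],
    ],
    columns=["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"],
)

# One aggregate row per doctype, analogous to all-doctype-agg-cct.tsv.
grouped = df.groupby("doctype").agg({"cct-accuracy": ["mean", "std", "count"]})
print(grouped)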

@@ -124,40 +132,31 @@ def measure_element_type_accuracy(
    if not source_list:
        source_list = _listdir_recursive(source_dir)

    if not output_list:
        print("No output files to calculate to element type for, exiting")
        sys.exit(0)

    rows = []
    accuracy_scores: List[float] = []

    for doc in output_list:  # type: ignore
        fn = (doc.split("/")[-1]).split(".json")[0]
        doctype = fn.rsplit(".", 1)[-1]
        fn_json = fn + ".json"
        filename = (doc.split("/")[-1]).split(".json")[0]
        doctype = filename.rsplit(".", 1)[-1]
        fn_json = filename + ".json"
        connector = doc.split("/")[0]
        if fn_json in source_list:  # type: ignore
            output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
            source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json)))
            accuracy = round(calculate_element_type_percent_match(output, source), 3)
            rows.append([fn, doctype, connector, accuracy])
            accuracy_scores.append(accuracy)
            rows.append([filename, doctype, connector, accuracy])

    headers = ["filename", "doctype", "connector", "element-type-accuracy"]
    _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers)
    df = pd.DataFrame(rows, columns=headers)
    if df.empty:
        agg_df = pd.DataFrame(["element-type-accuracy", None, None, None, 0]).transpose()
    else:
        agg_df = df.agg({"element-type-accuracy": [_mean, _stdev, _pstdev, "count"]}).transpose()
    agg_df = agg_df.reset_index()
    agg_df.columns = agg_headers

    agg_rows = []
    agg_rows.append(
        [
            "element-type-accuracy",
            _mean(accuracy_scores),
            _stdev(accuracy_scores),
            _pstdev(accuracy_scores),
            len(accuracy_scores),
        ],
    )
    _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, agg_headers)
    _display(agg_rows, agg_headers)
    _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df)
    _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df)
    _display(agg_df)


def _listdir_recursive(dir: str):

@@ -173,13 +172,20 @@ def _listdir_recursive(dir: str):
    return listdir


def _display(rows, headers):
def _format_grouping_output(*df):
    return pd.concat(df, axis=1).reset_index()


def _display(df):
    if len(df) == 0:
        return
    headers = df.columns.tolist()
    col_widths = [
        max(len(headers[i]), max(len(str(row[i])) for row in rows)) for i in range(len(headers))
        max(len(header), max(len(str(item)) for item in df[header])) for header in headers
    ]
    click.echo(" ".join(headers[i].ljust(col_widths[i]) for i in range(len(headers))))
    click.echo(" ".join(header.ljust(col_widths[i]) for i, header in enumerate(headers)))
    click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1))
    for row in rows:
    for _, row in df.iterrows():
        formatted_row = []
        for item in row:
            if isinstance(item, float):

@@ -191,31 +197,31 @@ def _display(rows, headers):
    )


def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any], mode: str = "w"):
def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"):
    if mode not in ["w", "a"]:
        raise ValueError("Mode not supported. Mode must be one of [w, a].")
    if dir and not os.path.exists(dir):
        os.makedirs(dir)
    with open(os.path.join(os.path.join(dir, filename)), mode, newline="") as tsv:
        writer = csv.writer(tsv, delimiter="\t")
        if mode == "w":
            writer.writerow(headers)
        writer.writerows(rows)
    if "count" in df.columns:
        df["count"] = df["count"].astype(int)
    if "filename" in df.columns and "connector" in df.columns:
        df.sort_values(by=["connector", "filename"], inplace=True)
    df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w"))


def _mean(scores: List[float], rounding: Optional[int] = 3):
    if len(scores) < 1:
def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3):
    if len(scores) == 0:
        return None
    elif len(scores) == 1:
        mean = scores[0]
    else:
        mean = statistics.mean(scores)
    mean = statistics.mean(scores)
    if not rounding:
        return mean
    return round(mean, rounding)


def _stdev(scores: List[float], rounding: Optional[int] = 3):
def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
    # Filter out None values
    scores = [score for score in scores if score is not None]
    # Proceed only if there are more than one value
    if len(scores) <= 1:
        return None
    if not rounding:

@@ -223,7 +229,8 @@ def _stdev(scores: List[float], rounding: Optional[int] = 3):
    return round(statistics.stdev(scores), rounding)


def _pstdev(scores: List[float], rounding: Optional[int] = 3):
def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
    scores = [score for score in scores if score is not None]
    if len(scores) <= 1:
        return None
    if not rounding:
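
Since every writer now funnels through `_write_to_file` and `df.to_csv(..., sep="\t")`, the exported files can be inspected directly with pandas. A small sketch, assuming the default `export_dir` of `metrics`:

import pandas as pd

per_doc = pd.read_csv("metrics/all-docs-cct.tsv", sep="\t")            # one row per document
aggregate = pd.read_csv("metrics/aggregate-scores-cct.tsv", sep="\t")
print(aggregate)   # columns: metric, average, sample_sd, population_sd, count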