diff --git a/CHANGELOG.md b/CHANGELOG.md
index c405be937..f2c0e0139 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,7 @@
 ### Features

 * **Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation.
-* **MongoDB Destination Connector** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.
+* **MongoDB Destination Connector.** New destination connector added to all CLI ingest commands to support writing partitioned json output to mongodb.

 ### Fixes

diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py
index 8053d12ba..92cbba2f1 100644
--- a/test_unstructured/metrics/test_evaluate.py
+++ b/test_unstructured/metrics/test_evaluate.py
@@ -1,6 +1,7 @@
 import os
 import pathlib

+import pandas as pd
 import pytest

 from unstructured.metrics.evaluate import (
@@ -34,3 +35,15 @@ def test_text_extraction_takes_list():
     with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
         lines = f.read().splitlines()
     assert len(lines) == len(output_list) + 1  # includes header
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_text_extraction_grouping():
+    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
+    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
+    measure_text_edit_distance(
+        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
+    )
+    df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
+    assert len(df) == 4
diff --git a/test_unstructured_ingest/check-diff-evaluation-metrics.sh b/test_unstructured_ingest/check-diff-evaluation-metrics.sh
index 90d6555ac..1c18082be 100755
--- a/test_unstructured_ingest/check-diff-evaluation-metrics.sh
+++ b/test_unstructured_ingest/check-diff-evaluation-metrics.sh
@@ -8,7 +8,7 @@
 # Environment Variables:
 #   - OVERWRITE_FIXTURES: Controls whether to overwrite fixtures or not. default: "false"

-set -e
+set +e

 SCRIPT_DIR=$(dirname "$(realpath "$0")")
 OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
@@ -45,10 +45,9 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
   # force copy (overwrite) files from metrics-tmp (new eval metrics) to metrics (old eval metrics)
   mkdir -p "$METRICS_DIR"
   cp -rf "$TMP_METRICS_LATEST_RUN_DIR" "$OUTPUT_ROOT/metrics"
-elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
+elif ! diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR" ; then
   "$SCRIPT_DIR"/clean-permissions-files.sh "$TMP_METRICS_LATEST_RUN_DIR"
-  diff -r "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
-  cat metricsdiff.txt
+  diff -ru "$METRICS_DIR" "$TMP_METRICS_LATEST_RUN_DIR"> metricsdiff.txt
  diffstat -c metricsdiff.txt
  echo
  echo "There are differences from the previously checked-in structured outputs."
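A minimal usage sketch of the grouping flow the new test drives, outside pytest and with hypothetical directories (the real test uses the TESTING_FILE_DIR fixtures shown above):

```python
import pandas as pd

from unstructured.metrics.evaluate import measure_text_edit_distance

# With grouping set, a per-doctype aggregate is written as all-doctype-agg-cct.tsv
# alongside aggregate-scores-cct.tsv in the export directory.
measure_text_edit_distance(
    output_dir="structured-output",  # hypothetical directory of partitioned .json output
    source_dir="gold-standard-cct",  # hypothetical directory of gold-standard .txt files
    export_dir="metrics-results",    # hypothetical export directory
    grouping="doctype",
)
print(pd.read_csv("metrics-results/all-doctype-agg-cct.tsv", sep="\t"))
```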
diff --git a/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv b/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv
index 9139c72b7..526bd6f08 100644
--- a/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv
+++ b/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv
@@ -1,2 +1,2 @@
-strategy average sample_sd population_sd count
-element-type-accuracy 0.814 0.108 0.077 2
+metric average sample_sd population_sd count
+element-type-accuracy 0.814 0.108 0.077 2
diff --git a/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv b/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv
index 4d141d4f7..aa368f746 100644
--- a/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv
+++ b/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv
@@ -1,3 +1,3 @@
-filename doctype connector element-type-accuracy
-IRS-form-1987.pdf pdf azure 0.89
-page-with-formula.pdf pdf s3 0.737
+filename doctype connector element-type-accuracy
+IRS-form-1987.pdf pdf azure 0.89
+page-with-formula.pdf pdf s3 0.737
diff --git a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
index bc8003243..8865cec2c 100644
--- a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
@@ -1,3 +1,3 @@
-strategy average sample_sd population_sd count
-cct-accuracy 0.792 0.253 0.245 15
-cct-%missing 0.025 0.034 0.033 15
+metric average sample_sd population_sd count
+cct-accuracy 0.803 0.249 0.241 16
+cct-%missing 0.024 0.033 0.032 16
diff --git a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
index 6a5dc96cf..9858ebfdd 100644
--- a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
@@ -1,16 +1,17 @@
-filename doctype connector cct-accuracy cct-%missing
-Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
-IRS-form-1987.pdf pdf azure 0.783 0.135
-spring-weather.html html azure 0.0 0.018
-fake-text.txt txt Sharepoint 1.0 0.0
-stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
-ideas-page.html html Sharepoint 0.929 0.033
-UDHR_first_article_all.txt txt local-single-file 0.995 0.0
-example-10k.html html local 0.686 0.037
-ideas-page.html html local 0.929 0.033
-fake-html-cp1252.html html local 0.659 0.0
-fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
-layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
-layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
-2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
-recalibrating-risk-report.pdf pdf s3 0.973 0.007
+filename doctype connector cct-accuracy cct-%missing
+fake-text.txt txt Sharepoint 1.0 0.0
+ideas-page.html html Sharepoint 0.929 0.033
+stanley-cups.xlsx xlsx Sharepoint 0.778 0.0
+Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf pdf azure 0.981 0.007
+IRS-form-1987.pdf pdf azure 0.783 0.135
+spring-weather.html html azure 0.0 0.018
+example-10k.html html local 0.686 0.037
+fake-html-cp1252.html html local 0.659 0.0
+ideas-page.html html local 0.929 0.033
+UDHR_first_article_all.txt txt local-single-file 0.995 0.0
+fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
+layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
+layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
+2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
+page-with-formula.pdf pdf s3 0.971 0.021
+recalibrating-risk-report.pdf pdf s3 0.973 0.007
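The aggregate fixture is derived from the per-document fixture just above; a small sketch (run from the repository root, reusing the checked-in path from this diff) that recomputes the cct-accuracy row of aggregate-scores-cct.tsv from all-docs-cct.tsv:

```python
import statistics

import pandas as pd

# Per-document scores checked in above (16 rows after this change).
df = pd.read_csv(
    "test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv", sep="\t"
)
scores = df["cct-accuracy"].tolist()

# Should reproduce the updated fixture row: 0.803  0.249  0.241  16
print(
    round(statistics.mean(scores), 3),
    round(statistics.stdev(scores), 3),
    round(statistics.pstdev(scores), 3),
    len(scores),
)
```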
diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py
index 7b055cf4f..a3e5dfe0a 100755
--- a/unstructured/ingest/evaluate.py
+++ b/unstructured/ingest/evaluate.py
@@ -36,6 +36,7 @@ def main():
     help="Directory to save the output evaluation metrics to. Default to \
         your/working/dir/metrics/",
 )
+@click.option("--grouping", type=str, help="Input field for aggregation, or leave blank if none.")
 @click.option(
     "--weights",
     type=(int, int, int),
@@ -50,10 +51,11 @@ def measure_text_edit_distance_command(
     output_list: Optional[List[str]],
     source_list: Optional[List[str]],
     export_dir: str,
+    grouping: Optional[str],
     weights: Tuple[int, int, int],
 ):
     return measure_text_edit_distance(
-        output_dir, source_dir, output_list, source_list, export_dir, weights
+        output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
     )


diff --git a/unstructured/metrics/element_type.py b/unstructured/metrics/element_type.py
index 6c77d6eb1..3e4e8cbf8 100644
--- a/unstructured/metrics/element_type.py
+++ b/unstructured/metrics/element_type.py
@@ -7,6 +7,11 @@ def get_element_type_frequency(
 ) -> Union[Dict[Tuple[str, Optional[int]], int], Dict]:
     """
     Calculate the frequency of Element Types from a list of elements.
+
+    Args:
+        elements (str): String-formatted json of all elements (as a result of elements_to_json).
+    Returns:
+        Element type and its frequency in dictionary format.
     """
     frequency: Dict = {}
     if len(elements) == 0:
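The docstring added above spells out the input contract for get_element_type_frequency; a minimal sketch (paths are hypothetical) of how it pairs with calculate_element_type_percent_match, mirroring the evaluation code later in this diff:

```python
from unstructured.metrics.element_type import (
    calculate_element_type_percent_match,
    get_element_type_frequency,
)

# Both files contain element JSON as produced by elements_to_json().
with open("structured-output/example.pdf.json") as f:  # hypothetical output path
    output_frequency = get_element_type_frequency(f.read())
with open("gold-standard/example.pdf.json") as f:  # hypothetical gold-standard path
    source_frequency = get_element_type_frequency(f.read())

# Frequencies are dicts keyed by (element type, optional depth); the match is the score
# that measure_element_type_accuracy rounds to three places.
print(round(calculate_element_type_percent_match(output_frequency, source_frequency), 3))
```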
diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
index c129cfb4e..79095fe60 100755
--- a/unstructured/metrics/evaluate.py
+++ b/unstructured/metrics/evaluate.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3

-import csv
 import logging
 import os
 import statistics
 import sys
-from typing import Any, List, Optional, Tuple
+from typing import List, Optional, Tuple, Union

 import click
+import pandas as pd

 from unstructured.metrics.element_type import (
     calculate_element_type_percent_match,
@@ -29,7 +29,7 @@
 if "ingest_log_handler" not in [h.name for h in logger.handlers]:
     logger.setLevel(logging.DEBUG)

-agg_headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
+agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]


 def measure_text_edit_distance(
@@ -38,6 +38,7 @@ def measure_text_edit_distance(
     output_list: Optional[List[str]] = None,
     source_list: Optional[List[str]] = None,
     export_dir: str = "metrics",
+    grouping: Optional[str] = None,
     weights: Tuple[int, int, int] = (2, 1, 1),
 ) -> None:
     """
@@ -58,50 +59,57 @@ def measure_text_edit_distance(
         sys.exit(0)

     rows = []
-    accuracy_scores: List[float] = []
-    percent_missing_scores: List[float] = []

     # assumption: output file name convention is name-of-file.doc.json
     for doc in output_list:  # type: ignore
-        fn = (doc.split("/")[-1]).split(".json")[0]
-        doctype = fn.rsplit(".", 1)[-1]
-        fn_txt = fn + ".txt"
+        filename = (doc.split("/")[-1]).split(".json")[0]
+        doctype = filename.rsplit(".", 1)[-1]
+        fn_txt = filename + ".txt"
         connector = doc.split("/")[0]

+        # not all odetta cct files follow the same naming convention;
+        # some exclude the original filetype from the name
+        if fn_txt not in source_list:
+            fn = filename.rsplit(".", 1)[0]
+            fn_txt = fn + ".txt"
+
         if fn_txt in source_list:  # type: ignore
             output_cct = elements_to_text(elements_from_json(os.path.join(output_dir, doc)))
             source_cct = _read_text(os.path.join(source_dir, fn_txt))
             accuracy = round(calculate_accuracy(output_cct, source_cct, weights), 3)
             percent_missing = round(calculate_percent_missing_text(output_cct, source_cct), 3)
-            rows.append([fn, doctype, connector, accuracy, percent_missing])
-            accuracy_scores.append(accuracy)
-            percent_missing_scores.append(percent_missing)
+            rows.append([filename, doctype, connector, accuracy, percent_missing])

     headers = ["filename", "doctype", "connector", "cct-accuracy", "cct-%missing"]
-    _write_to_file(export_dir, "all-docs-cct.tsv", rows, headers)
+    df = pd.DataFrame(rows, columns=headers)
+    export_filename = "all-docs-cct"

-    agg_rows = []
-    agg_rows.append(
-        [
-            "cct-accuracy",
-            _mean(accuracy_scores),
-            _stdev(accuracy_scores),
-            _pstdev(accuracy_scores),
-            len(accuracy_scores),
-        ],
-    )
-    agg_rows.append(
-        [
-            "cct-%missing",
-            _mean(percent_missing_scores),
-            _stdev(percent_missing_scores),
-            _pstdev(percent_missing_scores),
-            len(percent_missing_scores),
-        ],
-    )
-    _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_rows, agg_headers)
-    _display(agg_rows, agg_headers)
+    acc = df[["cct-accuracy"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
+    miss = df[["cct-%missing"]].agg([_mean, _stdev, _pstdev, "count"]).transpose()
+    agg_df = pd.concat((acc, miss)).reset_index()
+    agg_df.columns = agg_headers
+
+    if grouping:
+        if grouping in ["doctype", "connector"]:
+            grouped_acc = (
+                df.groupby(grouping)
+                .agg({"cct-accuracy": [_mean, _stdev, "count"]})
+                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
+            )
+            grouped_miss = (
+                df.groupby(grouping)
+                .agg({"cct-%missing": [_mean, _stdev, "count"]})
+                .rename(columns={"_mean": "mean", "_stdev": "stdev"})
+            )
+            df = _format_grouping_output(grouped_acc, grouped_miss)
+            export_filename = f"all-{grouping}-agg-cct"
+        else:
print("No field to group by. Returning a non-group evaluation.") + + _write_to_file(export_dir, f"{export_filename}.tsv", df) + _write_to_file(export_dir, "aggregate-scores-cct.tsv", agg_df) + _display(agg_df) def measure_element_type_accuracy( @@ -124,40 +132,31 @@ def measure_element_type_accuracy( if not source_list: source_list = _listdir_recursive(source_dir) - if not output_list: - print("No output files to calculate to element type for, exiting") - sys.exit(0) - rows = [] - accuracy_scores: List[float] = [] for doc in output_list: # type: ignore - fn = (doc.split("/")[-1]).split(".json")[0] - doctype = fn.rsplit(".", 1)[-1] - fn_json = fn + ".json" + filename = (doc.split("/")[-1]).split(".json")[0] + doctype = filename.rsplit(".", 1)[-1] + fn_json = filename + ".json" connector = doc.split("/")[0] if fn_json in source_list: # type: ignore output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc))) source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json))) accuracy = round(calculate_element_type_percent_match(output, source), 3) - rows.append([fn, doctype, connector, accuracy]) - accuracy_scores.append(accuracy) + rows.append([filename, doctype, connector, accuracy]) headers = ["filename", "doctype", "connector", "element-type-accuracy"] - _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", rows, headers) + df = pd.DataFrame(rows, columns=headers) + if df.empty: + agg_df = pd.DataFrame(["element-type-accuracy", None, None, None, 0]).transpose() + else: + agg_df = df.agg({"element-type-accuracy": [_mean, _stdev, _pstdev, "count"]}).transpose() + agg_df = agg_df.reset_index() + agg_df.columns = agg_headers - agg_rows = [] - agg_rows.append( - [ - "element-type-accuracy", - _mean(accuracy_scores), - _stdev(accuracy_scores), - _pstdev(accuracy_scores), - len(accuracy_scores), - ], - ) - _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, agg_headers) - _display(agg_rows, agg_headers) + _write_to_file(export_dir, "all-docs-element-type-frequency.tsv", df) + _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_df) + _display(agg_df) def _listdir_recursive(dir: str): @@ -173,13 +172,20 @@ def _listdir_recursive(dir: str): return listdir -def _display(rows, headers): +def _format_grouping_output(*df): + return pd.concat(df, axis=1).reset_index() + + +def _display(df): + if len(df) == 0: + return + headers = df.columns.tolist() col_widths = [ - max(len(headers[i]), max(len(str(row[i])) for row in rows)) for i in range(len(headers)) + max(len(header), max(len(str(item)) for item in df[header])) for header in headers ] - click.echo(" ".join(headers[i].ljust(col_widths[i]) for i in range(len(headers)))) + click.echo(" ".join(header.ljust(col_widths[i]) for i, header in enumerate(headers))) click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1)) - for row in rows: + for _, row in df.iterrows(): formatted_row = [] for item in row: if isinstance(item, float): @@ -191,31 +197,31 @@ def _display(rows, headers): ) -def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any], mode: str = "w"): +def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"): if mode not in ["w", "a"]: raise ValueError("Mode not supported. 
@@ -191,31 +197,31 @@
         )


-def _write_to_file(dir: str, filename: str, rows: List[Any], headers: List[Any], mode: str = "w"):
+def _write_to_file(dir: str, filename: str, df: pd.DataFrame, mode: str = "w"):
     if mode not in ["w", "a"]:
         raise ValueError("Mode not supported. Mode must be one of [w, a].")
     if dir and not os.path.exists(dir):
         os.makedirs(dir)
-    with open(os.path.join(os.path.join(dir, filename)), mode, newline="") as tsv:
-        writer = csv.writer(tsv, delimiter="\t")
-        if mode == "w":
-            writer.writerow(headers)
-        writer.writerows(rows)
+    if "count" in df.columns:
+        df["count"] = df["count"].astype(int)
+    if "filename" in df.columns and "connector" in df.columns:
+        df.sort_values(by=["connector", "filename"], inplace=True)
+    df.to_csv(os.path.join(dir, filename), sep="\t", mode=mode, index=False, header=(mode == "w"))


-def _mean(scores: List[float], rounding: Optional[int] = 3):
-    if len(scores) < 1:
+def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3):
+    if len(scores) == 0:
         return None
-    elif len(scores) == 1:
-        mean = scores[0]
-    else:
-        mean = statistics.mean(scores)
+    mean = statistics.mean(scores)
     if not rounding:
         return mean
     return round(mean, rounding)


-def _stdev(scores: List[float], rounding: Optional[int] = 3):
+def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
+    # Filter out None values
+    scores = [score for score in scores if score is not None]
+    # Proceed only if there is more than one value
     if len(scores) <= 1:
         return None
     if not rounding:
@@ -223,7 +229,8 @@ def _stdev(scores: List[float], rounding: Optional[int] = 3):
     return round(statistics.stdev(scores), rounding)


-def _pstdev(scores: List[float], rounding: Optional[int] = 3):
+def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3):
+    scores = [score for score in scores if score is not None]
     if len(scores) <= 1:
         return None
     if not rounding:
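The None-filtering added to _stdev and _pstdev above keeps empty or single-document evaluations from raising; a short usage sketch of these private helpers (importable from unstructured.metrics.evaluate as of this diff):

```python
from unstructured.metrics.evaluate import _mean, _pstdev, _stdev

print(_mean([0.9, 0.7]))         # 0.8
print(_stdev([0.9, None, 0.7]))  # 0.141 -- None scores are dropped before computing
print(_stdev([0.9]))             # None  -- needs at least two scores
print(_pstdev([0.9, 0.7]))       # 0.1
```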