feat: add visualize param to command and add test (#2178)

- Add a `visualize` parameter to the click evaluation commands -- pass the `--visualize` flag to show a progress bar while documents are processed.
- Rename `measure_text_edit_distance` to `measure_text_extraction_accuracy` (along with the corresponding CLI command and shell-script references) to better reflect what is measured.
Klaijan 2023-11-28 20:05:55 -05:00 committed by GitHub
parent 50b1431c9e
commit 0aae1faa54
8 changed files with 6418 additions and 97 deletions
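
The `--visualize` option described in the commit message above is an ordinary click boolean flag that, when set, turns on a progress bar while documents are evaluated. Below is a minimal, self-contained sketch of that pattern; the `demo` command, the tqdm-based bar, and the sleep placeholder are illustrative assumptions, not the project's actual evaluation code.

```python
import time

import click
from tqdm import tqdm


@click.command()
@click.option(
    "--visualize",
    is_flag=True,
    show_default=True,
    default=False,
    help="Add the flag to show progress bar.",
)
def demo(visualize: bool) -> None:
    """Toy command: iterate over fake documents, optionally with a progress bar."""
    docs = [f"doc-{i}" for i in range(5)]
    # Wrap the iterable in tqdm only when --visualize was passed.
    iterator = tqdm(docs, desc="evaluating") if visualize else docs
    for _ in iterator:
        time.sleep(0.1)  # stand-in for per-document evaluation work


if __name__ == "__main__":
    demo()
```

Running the script with `--visualize` shows the bar; omitting the flag keeps the output quiet.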


@@ -13,7 +13,7 @@
* **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. Do not consider text node of a `<style>` element as textual content.
* **Fix DOCX merged table cell repeats cell text.** Only include text for a merged cell, not for each underlying cell spanned by the merge.
* **Fix tables not extracted from DOCX header/footers.** Headers and footers in DOCX documents skip tables defined in the header and commonly used for layout/alignment purposes. Extract text from tables as a string and include in the `Header` and `Footer` document elements.
* **Fix output filepath for fsspec-based source connectors** Previously the base directory was being included in the output filepath unnecessarily.
* **Fix output filepath for fsspec-based source connectors.** Previously the base directory was being included in the output filepath unnecessarily.
## 0.11.0

File diff suppressed because it is too large.


@@ -5,7 +5,8 @@ import pandas as pd
import pytest
from unstructured.metrics.evaluate import (
measure_text_edit_distance,
measure_element_type_accuracy,
measure_text_extraction_accuracy,
)
is_in_docker = os.path.exists("/.dockerenv")
@@ -17,6 +18,37 @@ TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")
UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct"
GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_evaluation():
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_text_extraction_accuracy(
output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
)
assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
assert len(df) == 3
assert len(df.columns) == 5
assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_element_type_evaluation():
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_element_type_accuracy(
output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
)
assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"))
df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t")
assert len(df) == 1
assert len(df.columns) == 4
assert df.iloc[0].filename == "IRS-form-1987.pdf"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@@ -25,16 +57,15 @@ def test_text_extraction_takes_list():
output_list = ["currency.csv.json"]
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_text_edit_distance(
measure_text_extraction_accuracy(
output_dir=output_dir,
source_dir=source_dir,
output_list=output_list,
export_dir=export_dir,
)
# check that only the listed files are included
with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
lines = f.read().splitlines()
assert len(lines) == len(output_list) + 1 # includes header
df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
assert len(df) == len(output_list)
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@@ -42,8 +73,8 @@ def test_text_extraction_grouping():
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_text_edit_distance(
measure_text_extraction_accuracy(
output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
)
df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
assert len(df) == 4
assert len(df) == 4 # metrics row and doctype rows


@@ -13,7 +13,7 @@ mkdir -p "$OUTPUT_DIR"
EVAL_NAME="$1"
if [ "$EVAL_NAME" == "text-extraction" ]; then
METRIC_STRATEGY="measure-text-edit-distance-command"
METRIC_STRATEGY="measure-text-extraction-accuracy-command"
elif [ "$EVAL_NAME" == "element-type" ]; then
METRIC_STRATEGY="measure-element-type-accuracy-command"
else
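
The `METRIC_STRATEGY` value chosen here has to match the click command name exactly. Since click 7.0, a command's default name is the decorated function's name with underscores replaced by dashes, so renaming the function to `measure_text_extraction_accuracy_command` in the CLI module is what forces this script's string to change as well. A small sketch of that naming behavior (the `main` group mirrors the CLI module; the echo body is a placeholder):

```python
import click


@click.group()
def main():
    pass


# Registered under the CLI name "measure-text-extraction-accuracy-command":
# click derives it from the function name, converting underscores to dashes.
@main.command()
def measure_text_extraction_accuracy_command():
    click.echo("running text extraction accuracy evaluation")


if __name__ == "__main__":
    main()
```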


@@ -4,7 +4,10 @@ from typing import List, Optional, Tuple
import click
from unstructured.metrics.evaluate import measure_element_type_accuracy, measure_text_edit_distance
from unstructured.metrics.evaluate import (
measure_element_type_accuracy,
measure_text_extraction_accuracy,
)
@click.group()
@@ -42,10 +45,17 @@ def main():
type=(int, int, int),
default=(2, 1, 1),
show_default=True,
help="A tuple of weights to the Levenshtein distance calculation. \
help="A list of weights to the Levenshtein distance calculation. Takes input as --weights 2 2 2\
See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_text_edit_distance_command(
@click.option(
"--visualize",
is_flag=True,
show_default=True,
default=False,
help="Add the flag to show progress bar.",
)
def measure_text_extraction_accuracy_command(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]],
@@ -53,9 +63,10 @@ def measure_text_edit_distance_command(
export_dir: str,
grouping: Optional[str],
weights: Tuple[int, int, int],
visualize: bool,
):
return measure_text_edit_distance(
output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
return measure_text_extraction_accuracy(
output_dir, source_dir, output_list, source_list, export_dir, grouping, weights, visualize
)
@@ -83,15 +94,23 @@ def measure_text_edit_distance_command(
help="Directory to save the output evaluation metrics to. Default to \
your/working/dir/metrics/",
)
@click.option(
"--visualize",
is_flag=True,
show_default=True,
default=False,
help="Add the flag to show progress bar.",
)
def measure_element_type_accuracy_command(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]],
source_list: Optional[List[str]],
export_dir: str,
visualize: bool,
):
return measure_element_type_accuracy(
output_dir, source_dir, output_list, source_list, export_dir
output_dir, source_dir, output_list, source_list, export_dir, visualize
)
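
A hedged example of exercising the renamed command together with the new flag via click's test runner follows. The command name, option names, and `--visualize` flag come from the diffs above; the import path `unstructured.ingest.evaluate` and the directory values are assumptions and must point at real output/gold directories for a clean run.

```python
from click.testing import CliRunner

# Assumed location of the CLI group shown in the diff above.
from unstructured.ingest.evaluate import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "measure-text-extraction-accuracy-command",
        "--output_dir", "unstructured_output",  # placeholder directories
        "--source_dir", "gold_standard_cct",
        "--export_dir", "metrics_results",
        "--visualize",  # new flag added by this commit
    ],
)
print(result.exit_code, result.output)
```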


@@ -1,81 +0,0 @@
import os
from typing import List, Optional, Tuple
import click
import pandas as pd
from unstructured.metrics.evaluate import measure_text_edit_distance
@click.group()
def main():
pass
def aggregate_cct_data_by_doctype(results_dir: str):
# load tsv into dataframe
df = pd.read_csv(os.path.join(results_dir, "all-docs-cct.tsv"), sep="\t", header=0)
# group by doctype and calculate stats
agg_df = df.groupby("doctype").agg(
{"cct-accuracy": ["mean", "std", "count"], "cct-%missing": ["mean", "std", "count"]}
)
# write results to same export results folder
agg_df.to_csv(os.path.join(results_dir, "all-doctypes-agg-cct.tsv"))
@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to source.")
@click.option(
"--output_list",
type=str,
multiple=True,
help="Optional: list of selected structured output file names under the \
directory to be evaluate. If none, all files under directory will be use.",
)
@click.option(
"--source_list",
type=str,
multiple=True,
help="Optional: list of selected source file names under the directory \
to be evaluate. If none, all files under directory will be use.",
)
@click.option(
"--export_dir",
type=str,
default="metrics_results",
help="Directory to save the output evaluation metrics to. Default to \
[your_working_dir]/metrics_results/",
)
@click.option(
"--weights",
type=(int, int, int),
default=(2, 1, 1),
show_default=True,
help="A tuple of weights to the Levenshtein distance calculation. \
See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_holistic_eval_cct(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]],
source_list: Optional[List[str]],
export_dir: str,
weights: Tuple[int, int, int],
) -> None:
export_dir = "result_doctype_aggregate"
measure_text_edit_distance(
output_dir=output_dir,
source_dir=source_dir,
output_list=output_list,
source_list=source_list,
export_dir=export_dir,
weights=weights,
)
aggregate_cct_data_by_doctype(export_dir)
if __name__ == "__main__":
main()
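
The deleted module above wrapped `measure_text_edit_distance` and then aggregated the per-document CCT scores by doctype with pandas. After this commit a comparable doctype aggregation is reachable directly through the `grouping` parameter of the renamed function, as the new `test_text_extraction_grouping` test shows; a hedged equivalent call (the directory values are placeholders):

```python
from unstructured.metrics.evaluate import measure_text_extraction_accuracy

# grouping="doctype" writes all-doctype-agg-cct.tsv under export_dir.
measure_text_extraction_accuracy(
    output_dir="unstructured_output",  # placeholder paths
    source_dir="gold_standard_cct",
    export_dir="metrics_results",
    grouping="doctype",
)
```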


@@ -33,7 +33,7 @@ logger.setLevel(logging.DEBUG)
agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]
def measure_text_edit_distance(
def measure_text_extraction_accuracy(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]] = None,