diff --git a/CHANGELOG.md b/CHANGELOG.md index 72c63e62d..5f74285f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ### Enhancements +* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance. + ### Features ### Fixes diff --git a/test_unstructured_ingest/evaluation-ingest-cp.sh b/test_unstructured_ingest/evaluation-ingest-cp.sh index c8dd56053..46c78d101 100755 --- a/test_unstructured_ingest/evaluation-ingest-cp.sh +++ b/test_unstructured_ingest/evaluation-ingest-cp.sh @@ -17,7 +17,7 @@ selected_outputs=$(cat "$SCRIPT_DIR/metrics/metrics-json-manifest.txt") # If structured output file in this connector's outputs match the # selected outputs in the txt file, copy to the destination for file in "${structured_outputs[@]}"; do - if [[ "${selected_outputs[*]}" =~ $(basename "$file") ]] ; then + if [[ -f "$file" && "${selected_outputs[*]}" =~ $(basename "$file") ]] ; then echo "--- Copying $file to $CP_DIR ---" cp "$file" "$CP_DIR" fi diff --git a/test_unstructured_ingest/evaluation-metrics.sh b/test_unstructured_ingest/evaluation-metrics.sh index b8c48d04a..1e24912da 100755 --- a/test_unstructured_ingest/evaluation-metrics.sh +++ b/test_unstructured_ingest/evaluation-metrics.sh @@ -9,23 +9,36 @@ cd "$SCRIPT_DIR"/.. 
|| exit 1 OUTPUT_DIR=$SCRIPT_DIR/structured-output-eval mkdir -p "$OUTPUT_DIR" +EVAL_NAME="$1" + # Download cct test from s3 BUCKET_NAME=utic-dev-tech-fixtures -FOLDER_NAME=small-cct -CCT_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME -mkdir -p "$CCT_DIR" -aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$CCT_DIR" --recursive --no-sign-request --region us-east-2 +FOLDER_NAME=small-eval-"$EVAL_NAME" +LOCAL_EVAL_SOURCE_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME +mkdir -p "$LOCAL_EVAL_SOURCE_DIR" +aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$LOCAL_EVAL_SOURCE_DIR" --recursive --no-sign-request --region us-east-2 + +EXPORT_DIR="$SCRIPT_DIR"/metrics # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$CCT_DIR" + cleanup_dir "$LOCAL_EVAL_SOURCE_DIR" } trap cleanup EXIT -EXPORT_DIR="$SCRIPT_DIR"/metrics +if [ "$EVAL_NAME" == "text-extraction" ]; then + STRATEGY="measure-text-edit-distance" +elif [ "$EVAL_NAME" == "element-type" ]; then + STRATEGY="measure-element-type-accuracy" +else + echo "Wrong evaluation strategy given. Got [ $EVAL_NAME ]." + exit 1 +fi + PYTHONPATH=. 
./unstructured/ingest/evaluate.py \ + $STRATEGY \ --output_dir "$OUTPUT_DIR" \ - --source_dir "$CCT_DIR" \ - --export_dir "$EXPORT_DIR" + --source_dir "$LOCAL_EVAL_SOURCE_DIR" \ + --export_dir "$EXPORT_DIR" \ No newline at end of file diff --git a/test_unstructured_ingest/metrics/.gitkeep b/test_unstructured_ingest/metrics/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv index 5ba6eecde..2553a0f90 100644 --- a/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv +++ b/test_unstructured_ingest/metrics/aggregate-scores-cct.tsv @@ -1,3 +1,3 @@ strategy average sample_sd population_sd count -cct-accuracy 0.774 0.124 0.087 2 -cct-%missing 0.065 0.035 0.025 2 +cct-accuracy 0.777 0.088 0.072 3 +cct-%missing 0.087 0.045 0.037 3 diff --git a/test_unstructured_ingest/metrics/aggregate-scores-element-type.tsv b/test_unstructured_ingest/metrics/aggregate-scores-element-type.tsv new file mode 100644 index 000000000..210817b98 --- /dev/null +++ b/test_unstructured_ingest/metrics/aggregate-scores-element-type.tsv @@ -0,0 +1,2 @@ +strategy average sample_sd population_sd count +element-type-accuracy 0 diff --git a/test_unstructured_ingest/metrics/all-docs-cct.tsv b/test_unstructured_ingest/metrics/all-docs-cct.tsv index f8141f60a..048c30d77 100644 --- a/test_unstructured_ingest/metrics/all-docs-cct.tsv +++ b/test_unstructured_ingest/metrics/all-docs-cct.tsv @@ -1,3 +1,4 @@ filename connector cct-accuracy cct-%missing +IRS-form-1987.pdf azure 0.783 0.13 example-10k.html local 0.686 0.04 science-exploration-1p.pptx box 0.861 0.09 diff --git a/test_unstructured_ingest/metrics/all-docs-element-type.tsv b/test_unstructured_ingest/metrics/all-docs-element-type.tsv new file mode 100644 index 000000000..50d494248 --- /dev/null +++ b/test_unstructured_ingest/metrics/all-docs-element-type.tsv @@ -0,0 +1 @@ +filename connector 
element-type-accuracy diff --git a/test_unstructured_ingest/test-ingest-against-api.sh b/test_unstructured_ingest/test-ingest-against-api.sh index d6d93f835..74a4c6824 100755 --- a/test_unstructured_ingest/test-ingest-against-api.sh +++ b/test_unstructured_ingest/test-ingest-against-api.sh @@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-airtable-diff.sh b/test_unstructured_ingest/test-ingest-airtable-diff.sh index 77406ded2..b310ffcc3 100755 --- a/test_unstructured_ingest/test-ingest-airtable-diff.sh +++ b/test_unstructured_ingest/test-ingest-airtable-diff.sh @@ -48,3 +48,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --verbose "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-azure.sh b/test_unstructured_ingest/test-ingest-azure.sh index 378f021f6..2af96b65e 100755 --- a/test_unstructured_ingest/test-ingest-azure.sh +++ b/test_unstructured_ingest/test-ingest-azure.sh @@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-biomed-api.sh b/test_unstructured_ingest/test-ingest-biomed-api.sh index acd724314..01041750f 100755 --- a/test_unstructured_ingest/test-ingest-biomed-api.sh +++ b/test_unstructured_ingest/test-ingest-biomed-api.sh @@ -39,3 +39,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-biomed-path.sh b/test_unstructured_ingest/test-ingest-biomed-path.sh index ac43dfe97..7cc479276 100755 --- a/test_unstructured_ingest/test-ingest-biomed-path.sh +++ b/test_unstructured_ingest/test-ingest-biomed-path.sh @@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-confluence-diff.sh b/test_unstructured_ingest/test-ingest-confluence-diff.sh index 35619421d..adb76477b 100755 --- a/test_unstructured_ingest/test-ingest-confluence-diff.sh +++ b/test_unstructured_ingest/test-ingest-confluence-diff.sh @@ -45,3 +45,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-delta-table.sh b/test_unstructured_ingest/test-ingest-delta-table.sh index c4be1a25f..8efa37c4a 100755 --- a/test_unstructured_ingest/test-ingest-delta-table.sh +++ b/test_unstructured_ingest/test-ingest-delta-table.sh @@ -49,3 +49,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE" + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-discord.sh b/test_unstructured_ingest/test-ingest-discord.sh index b32b7df35..56f22672b 100755 --- a/test_unstructured_ingest/test-ingest-discord.sh +++ b/test_unstructured_ingest/test-ingest-discord.sh @@ -41,3 +41,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-elasticsearch.sh b/test_unstructured_ingest/test-ingest-elasticsearch.sh index a983c2781..6481187cb 100755 --- a/test_unstructured_ingest/test-ingest-elasticsearch.sh +++ b/test_unstructured_ingest/test-ingest-elasticsearch.sh @@ -48,3 +48,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-embed.sh b/test_unstructured_ingest/test-ingest-embed.sh index e5ed942be..b9c282096 100755 --- a/test_unstructured_ingest/test-ingest-embed.sh +++ b/test_unstructured_ingest/test-ingest-embed.sh @@ -34,3 +34,5 @@ set +e # once we have an alternative encoder that is deterministic, we test the diff here # until then just validating the file was created "$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME" + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-gcs.sh b/test_unstructured_ingest/test-ingest-gcs.sh index 95ba89e44..4ce6cf227 100755 --- a/test_unstructured_ingest/test-ingest-gcs.sh +++ b/test_unstructured_ingest/test-ingest-gcs.sh @@ -47,3 +47,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-github.sh b/test_unstructured_ingest/test-ingest-github.sh index 31d11995e..0c8a4d904 100755 --- a/test_unstructured_ingest/test-ingest-github.sh +++ b/test_unstructured_ingest/test-ingest-github.sh @@ -52,3 +52,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ $ACCESS_TOKEN_FLAGS "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-gitlab.sh b/test_unstructured_ingest/test-ingest-gitlab.sh index 9f5003f68..35985fd5a 100755 --- a/test_unstructured_ingest/test-ingest-gitlab.sh +++ b/test_unstructured_ingest/test-ingest-gitlab.sh @@ -38,3 +38,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-google-drive.sh b/test_unstructured_ingest/test-ingest-google-drive.sh index 09db555ec..ffdcc4d8b 100755 --- a/test_unstructured_ingest/test-ingest-google-drive.sh +++ b/test_unstructured_ingest/test-ingest-google-drive.sh @@ -48,3 +48,5 @@ PYTHONPATH=. unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-jira.sh b/test_unstructured_ingest/test-ingest-jira.sh index f3646d9af..aa8668338 100755 --- a/test_unstructured_ingest/test-ingest-jira.sh +++ b/test_unstructured_ingest/test-ingest-jira.sh @@ -68,3 +68,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh index f409c9ad7..b662db687 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh @@ -31,3 +31,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh index 9cc358598..b4ccbfe58 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh @@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-local-single-file.sh b/test_unstructured_ingest/test-ingest-local-single-file.sh index bd9f4fc34..c06660e73 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file.sh @@ -32,3 +32,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-notion.sh b/test_unstructured_ingest/test-ingest-notion.sh index de7c73cbc..221cc580a 100755 --- a/test_unstructured_ingest/test-ingest-notion.sh +++ b/test_unstructured_ingest/test-ingest-notion.sh @@ -42,3 +42,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-onedrive.sh b/test_unstructured_ingest/test-ingest-onedrive.sh index 0cad6257f..cef497c6a 100755 --- a/test_unstructured_ingest/test-ingest-onedrive.sh +++ b/test_unstructured_ingest/test-ingest-onedrive.sh @@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-outlook.sh b/test_unstructured_ingest/test-ingest-outlook.sh index 82b76bdcc..cbed5fa8d 100755 --- a/test_unstructured_ingest/test-ingest-outlook.sh +++ b/test_unstructured_ingest/test-ingest-outlook.sh @@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh index d3e741c36..8e0175cd1 100755 --- a/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh @@ -42,3 +42,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-s3-compression.sh b/test_unstructured_ingest/test-ingest-s3-compression.sh index aee21c1dc..a329c193a 100755 --- a/test_unstructured_ingest/test-ingest-s3-compression.sh +++ b/test_unstructured_ingest/test-ingest-s3-compression.sh @@ -35,3 +35,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --uncompress "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-s3-minio.sh b/test_unstructured_ingest/test-ingest-s3-minio.sh index 0164cb9dd..8604e75ee 100755 --- a/test_unstructured_ingest/test-ingest-s3-minio.sh +++ b/test_unstructured_ingest/test-ingest-s3-minio.sh @@ -47,3 +47,5 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key PYTHONPATH=. ./u "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-s3.sh b/test_unstructured_ingest/test-ingest-s3.sh index e150e366d..483c6f21e 100755 --- a/test_unstructured_ingest/test-ingest-s3.sh +++ b/test_unstructured_ingest/test-ingest-s3.sh @@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-salesforce.sh b/test_unstructured_ingest/test-ingest-salesforce.sh index bfaec6e64..547212d8a 100755 --- a/test_unstructured_ingest/test-ingest-salesforce.sh +++ b/test_unstructured_ingest/test-ingest-salesforce.sh @@ -58,3 +58,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-sharepoint.sh b/test_unstructured_ingest/test-ingest-sharepoint.sh index 504a3b83e..5f4c9712b 100755 --- a/test_unstructured_ingest/test-ingest-sharepoint.sh +++ b/test_unstructured_ingest/test-ingest-sharepoint.sh @@ -54,3 +54,5 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-slack.sh b/test_unstructured_ingest/test-ingest-slack.sh index ecc96994d..d4fab1664 100755 --- a/test_unstructured_ingest/test-ingest-slack.sh +++ b/test_unstructured_ingest/test-ingest-slack.sh @@ -44,3 +44,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest-wikipedia.sh b/test_unstructured_ingest/test-ingest-wikipedia.sh index 0f81060e8..253009146 100755 --- a/test_unstructured_ingest/test-ingest-wikipedia.sh +++ b/test_unstructured_ingest/test-ingest-wikipedia.sh @@ -35,3 +35,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --work-dir "$WORK_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh index 52fbf7828..f7c7260e4 100755 --- a/test_unstructured_ingest/test-ingest.sh +++ b/test_unstructured_ingest/test-ingest.sh @@ -97,6 +97,13 @@ for test in "${all_tests[@]}"; do fi done -echo "--------- RUNNING SCRIPT evaluation-metrics.sh ---------" -./test_unstructured_ingest/evaluation-metrics.sh -echo "--------- FINISHED SCRIPT evaluation-metrics.sh ---------" +all_eval=( + 'text-extraction' + 'element-type' +) +for eval in "${all_eval[@]}"; do + CURRENT_TEST="$eval" + echo "--------- RUNNING SCRIPT $eval ---------" + ./test_unstructured_ingest/evaluation-metrics.sh "$eval" + echo "--------- FINISHED SCRIPT $eval ---------" +done \ No newline at end of file diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py 
@click.group()
def main():
    """Entry point that groups the evaluation sub-commands of this script."""
    pass


@main.command()
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
@click.option(
    "--output_list",
    type=click.STRING,
    multiple=True,
    help="Optional: list of selected structured output file names under the \
        directory to be evaluate. If none, all files under directory will be use.",
)
@click.option("--source_dir", type=click.STRING, help="Directory to a structured source.")
@click.option(
    "--source_list",
    type=click.STRING,
    multiple=True,
    help="Optional: list of selected structured source file names under the directory \
        to be evaluate. If none, all files under directory will be use.",
)
@click.option(
    "--export_dir",
    type=click.STRING,
    default="metrics",
    help="Directory to save the output evaluation metrics to. Default to \
        [your_working_dir]/metrics/",
)
def measure_element_type_accuracy(
    output_dir: str,
    output_list: Optional[List[str]],
    source_dir: str,
    source_list: Optional[List[str]],
    export_dir: str,
):
    """Score structured outputs against source files by element type.

    For every output document whose `<name>.json` is present in the source
    list, the raw text of both files is passed to
    `get_element_type_frequency` and the two results are compared with
    `calculate_element_type_percent_match`. Per-document rows are written to
    `all-docs-element-type.tsv` and one aggregate row (mean, sample SD,
    population SD, count) to `aggregate-scores-element-type.tsv`, both under
    `export_dir`; the aggregate row is also displayed.
    """
    # An empty selection (click passes an empty tuple) means "use every file".
    if not output_list:
        output_list = _listdir_recursive(output_dir)
    if not source_list:
        source_list = _listdir_recursive(source_dir)

    # Build the lookup once so each membership test below is O(1) instead of
    # rescanning the whole source list for every output document.
    available_sources = set(source_list)

    rows = []
    accuracy_scores: List[float] = []

    for doc in output_list:
        path_parts = doc.split("/")
        # Base name without the ".json" suffix; the first path component is
        # reported as the connector name.
        fn = path_parts[-1].split(".json")[0]
        fn_json = fn + ".json"
        connector = path_parts[0]
        if fn_json not in available_sources:
            # No matching source file: this document is left out of the scores.
            continue
        output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
        source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json)))
        accuracy = round(calculate_element_type_percent_match(output, source), 3)
        rows.append([fn, connector, accuracy])
        accuracy_scores.append(accuracy)

    headers = ["filename", "connector", "element-type-accuracy"]
    _write_to_file(export_dir, "all-docs-element-type.tsv", rows, headers)

    headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
    agg_rows = [
        [
            "element-type-accuracy",
            _mean(accuracy_scores),
            _stdev(accuracy_scores),
            _pstdev(accuracy_scores),
            len(accuracy_scores),
        ],
    ]
    _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, headers)
    _display(agg_rows, headers)


def _mean(scores: List[float], rounding: Optional[int] = 3):
    """Return the arithmetic mean of `scores`, or None when there are none.

    Args:
        scores: values to average.
        rounding: number of decimal places to round to; a falsy value
            (None or 0) returns the mean unrounded.
    """
    if not scores:
        # statistics.mean raises on empty input; report "no value" instead.
        return None
    mean = statistics.mean(scores)
    if not rounding:
        return mean
    return round(mean, rounding)
def _read_text(path: str) -> str:
    """Return the full contents of the file at `path` as a string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()


if __name__ == "__main__":
    main()