mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-23 09:00:40 +00:00

**Executive Summary** This PR adds evaluation metrics to our current workflow. It verifies the flow in which, when code is pushed, the code gets evaluated against our gold standard and the results are output to a `.tsv` file. **Technical Details** - Adds evaluation metrics to the test-ingest workflow - Makes use of the `structured-output` from `test-ingest` and compares it to the gold standard uploaded to S3, which is downloaded locally when making the comparison. The folder currently in use is `s3://utic-dev-tech-fixtures/small-cct`. This directory is editable in the shell script. - With this PR, only one file from one connector is used for comparison. **Misc** - There are not many overlapping files between test-ingest and the gold standard. More files will be added. **Outputs** 2 `.tsv` files are saved under `test_unstructured_ingest/metrics/`.   --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
56 lines
1.8 KiB
Bash
Executable File
56 lines
1.8 KiB
Bash
Executable File
#!/usr/bin/env bash

# Set either BOX_APP_CONFIG (app config json content as string) or
# BOX_APP_CONFIG_PATH (path to app config json file) env var
#
# Optional env vars:
#   MAX_PROCESSES  value used for --num-processes; defaults to the CPU count
#                  reported by python3's os.cpu_count()
#   CI             defaults to "false"; when "true" the download directory is
#                  also passed to cleanup_dir on exit

set -e

# Output, work and download directories all live next to this script; the
# remaining commands run from the script's parent directory.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=box
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# ':=' also assigns the computed default back to MAX_PROCESSES itself.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
CI=${CI:-"false"}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
# Exit-time cleanup: hands the output and work directories to cleanup_dir
# (expected to be provided by the sourced cleanup.sh). The download directory
# is included only when CI is "true", so other runs keep their downloads.
# Globals: OUTPUT_DIR, WORK_DIR, DOWNLOAD_DIR, CI (all read)
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
  if [[ "$CI" == "true" ]]; then
    cleanup_dir "$DOWNLOAD_DIR"
  fi
}
# Path of the temporary app-config file created below, if any. It stays empty
# when the caller supplied BOX_APP_CONFIG_PATH, so a caller-owned file is
# never deleted.
TEMP_BOX_APP_CONFIG_PATH=""

# EXIT handler: removes the temporary credentials file (only if this script
# created one) and then runs the regular directory cleanup.
function cleanup_on_exit() {
  if [ -n "$TEMP_BOX_APP_CONFIG_PATH" ]; then
    rm -f -- "$TEMP_BOX_APP_CONFIG_PATH"
  fi
  cleanup
}
trap cleanup_on_exit EXIT

if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
  echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
  exit 0
fi

if [ -z "$BOX_APP_CONFIG_PATH" ]; then
  # Create temporary service key file
  TEMP_BOX_APP_CONFIG_PATH=$(mktemp)
  BOX_APP_CONFIG_PATH=$TEMP_BOX_APP_CONFIG_PATH
  # printf writes the value verbatim (plus a newline) regardless of content.
  printf '%s\n' "$BOX_APP_CONFIG" >"$BOX_APP_CONFIG_PATH"
fi

# Run the Box connector of the ingest CLI against the test fixtures folder,
# writing structured output to OUTPUT_DIR and keeping downloads in
# DOWNLOAD_DIR.
PYTHONPATH=. ./unstructured/ingest/main.py \
  box \
  --download-dir "$DOWNLOAD_DIR" \
  --box-app-config "$BOX_APP_CONFIG_PATH" \
  --remote-url box://utic-test-ingest-fixtures \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --output-dir "$OUTPUT_DIR" \
  --num-processes "$max_processes" \
  --preserve-downloads \
  --recursive \
  --reprocess \
  --verbose \
  --work-dir "$WORK_DIR"

# Post-processing helpers that live next to this script; both receive the
# connector's output folder name, the second one also the output directory.
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"