mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

**Executive Summary** This PR adds the evaluation metrics to our current workflow. It verifies the flow that when the code is pushed, the code will gets evaluate against our gold standard and output into `.tsv` file. **Technical Details** - Adds evaluation metrics to the test-ingest workflow - Make use of `structured-output` from `test-ingest` and compare to the gold-standard uploaded in s3, and download into local when make comparison. The current folder in-use is `s3://utic-dev-tech-fixtures/small-cct`. This dir is editable in the shell script. - With this PR, only one file from one connector is use to compare. **Misc** - Not many overlapped files between test-ingest and gold-standard. More files will be added. **Outputs** 2 `.tsv` files are saved under `test_unstructured_ingest/metrics/`.   --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
53 lines
1.8 KiB
Bash
Executable File
53 lines
1.8 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
set -e
|
|
|
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
|
cd "$SCRIPT_DIR"/.. || exit 1
|
|
OUTPUT_FOLDER_NAME=dropbox
|
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
|
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
|
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
|
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
|
CI=${CI:-"false"}
|
|
|
|
# shellcheck disable=SC1091
|
|
source "$SCRIPT_DIR"/cleanup.sh
|
|
function cleanup() {
|
|
cleanup_dir "$OUTPUT_DIR"
|
|
cleanup_dir "$WORK_DIR"
|
|
if [ "$CI" == "true" ]; then
|
|
cleanup_dir "$DOWNLOAD_DIR"
|
|
fi
|
|
}
|
|
trap cleanup EXIT
|
|
|
|
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
|
|
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
|
|
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
|
|
exit 0
|
|
fi
|
|
|
|
# Get a new access token from Dropbox
|
|
DROPBOX_RESPONSE=$(curl https://api.dropbox.com/oauth2/token -d refresh_token="$DROPBOX_REFRESH_TOKEN" -d grant_type=refresh_token -d client_id="$DROPBOX_APP_KEY" -d client_secret="$DROPBOX_APP_SECRET")
|
|
DROPBOX_ACCESS_TOKEN=$(jq -r '.access_token' <<< "$DROPBOX_RESPONSE")
|
|
|
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
|
dropbox \
|
|
--num-processes "$max_processes" \
|
|
--download-dir "$DOWNLOAD_DIR" \
|
|
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
|
--preserve-downloads \
|
|
--reprocess \
|
|
--output-dir "$OUTPUT_DIR" \
|
|
--verbose \
|
|
--token "$DROPBOX_ACCESS_TOKEN" \
|
|
--recursive \
|
|
--remote-url "dropbox:// /" \
|
|
--work-dir "$WORK_DIR"
|
|
|
|
|
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
|
|
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|