ci: parametrize ingest test checking scripts (#2062)

- parametrize the output folder paths and expected output folder paths
in comparison scripts
- now allows users to set the `OUTPUT_ROOT` environment variable to control
where the output and expected output are located
- currently assumes the test output and the expected output live under the same
root directory (`OUTPUT_ROOT`); they may need to be separated later

## test
run
```bash
OUTPUT_ROOT=/tmp ./test_unstructured_ingest/test-ingest-src.sh
```
and it should report which files changed, but it will not be able to show a diff
because there is no expected output content at `OUTPUT_ROOT`.

Then run
```bash
cp -R test_unstructured_ingest/expected-* /tmp/
OUTPUT_ROOT=/tmp ./test_unstructured_ingest/test-ingest-src.sh
```
we can now see an actual line-by-line diff (because CI and a local instance
produce different results).
This commit is contained in:
Yao You 2023-11-13 12:42:19 -06:00 committed by GitHub
parent 1ead5a27df
commit 36c4441e2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 25 additions and 19 deletions

View File

@ -8,7 +8,6 @@
* **Include `languages` in metadata when partitioning strategy='hi_res' or 'fast'** User defined `languages` was previously used for text detection, but not included in the resulting element metadata for some strategies. `languages` will now be included in the metadata regardless of partition strategy for pdfs and images. * **Include `languages` in metadata when partitioning strategy='hi_res' or 'fast'** User defined `languages` was previously used for text detection, but not included in the resulting element metadata for some strategies. `languages` will now be included in the metadata regardless of partition strategy for pdfs and images.
## 0.10.30 ## 0.10.30
### Enhancements ### Enhancements

View File

@ -16,10 +16,11 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")")
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false} OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true} TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true}
OUTPUT_FOLDER_NAME=$1 OUTPUT_FOLDER_NAME=$1
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR_TEXT=$OUTPUT_ROOT/text-output/$OUTPUT_FOLDER_NAME
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME EXPECTED_OUTPUT_DIR=$OUTPUT_ROOT/expected-structured-output/$OUTPUT_FOLDER_NAME
EXPECTED_OUTPUT_DIR_TEXT=$OUTPUT_ROOT/expected-text-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091 # shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh source "$SCRIPT_DIR"/cleanup.sh

View File

@ -11,7 +11,8 @@ set +e
EXPECTED_NUM_DIRS=$1 EXPECTED_NUM_DIRS=$1
OUTPUT_FOLDER_NAME=$2 OUTPUT_FOLDER_NAME=$2
SCRIPT_DIR=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$(realpath "$0")")
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
NUMBER_OF_FOUND_DIRS="$(find "$OUTPUT_DIR" -type d -exec printf '.' \; | wc -c | xargs)" NUMBER_OF_FOUND_DIRS="$(find "$OUTPUT_DIR" -type d -exec printf '.' \; | wc -c | xargs)"

View File

@ -13,7 +13,8 @@ EXPECTED_NUM_FILES=$1
OUTPUT_FOLDER_NAME=$2 OUTPUT_FOLDER_NAME=$2
EXPECTED_SIZE=$3 EXPECTED_SIZE=$3
SCRIPT_DIR=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$(realpath "$0")")
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
EXPECTED_OUTPUT_DIR=$OUTPUT_ROOT/expected-structured-output/$OUTPUT_FOLDER_NAME
NUM_FILES=$(find "$EXPECTED_OUTPUT_DIR" -type f -size +"$EXPECTED_SIZE" | wc -l) NUM_FILES=$(find "$EXPECTED_OUTPUT_DIR" -type f -size +"$EXPECTED_SIZE" | wc -l)
# Note: single brackets and "-ne" operator were necessary for evaluation in CI # Note: single brackets and "-ne" operator were necessary for evaluation in CI

View File

@ -11,7 +11,8 @@ set +e
EXPECTED_NUM_FILES=$1 EXPECTED_NUM_FILES=$1
OUTPUT_FOLDER_NAME=$2 OUTPUT_FOLDER_NAME=$2
SCRIPT_DIR=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$(realpath "$0")")
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)"
# Note: single brackets and "-ne" operator were necessary for evaluation in CI # Note: single brackets and "-ne" operator were necessary for evaluation in CI

View File

@ -9,7 +9,8 @@ OUTPUT_DIR=$1
OUTPUT_FOLDER_NAME=$2 OUTPUT_FOLDER_NAME=$2
structured_outputs=("$OUTPUT_DIR"/*) structured_outputs=("$OUTPUT_DIR"/*)
CP_DIR=$SCRIPT_DIR/structured-output-eval/$OUTPUT_FOLDER_NAME OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
CP_DIR=$OUTPUT_ROOT/structured-output-eval/$OUTPUT_FOLDER_NAME
mkdir -p "$CP_DIR" mkdir -p "$CP_DIR"
selected_outputs=$(cat "$SCRIPT_DIR/metrics/metrics-json-manifest.txt") selected_outputs=$(cat "$SCRIPT_DIR/metrics/metrics-json-manifest.txt")

View File

@ -6,7 +6,8 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
# List all structured outputs to use in this evaluation # List all structured outputs to use in this evaluation
OUTPUT_DIR=$SCRIPT_DIR/structured-output-eval OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output-eval
mkdir -p "$OUTPUT_DIR" mkdir -p "$OUTPUT_DIR"
EVAL_NAME="$1" EVAL_NAME="$1"
@ -23,7 +24,7 @@ fi
# Download cct test from s3 # Download cct test from s3
BUCKET_NAME=utic-dev-tech-fixtures BUCKET_NAME=utic-dev-tech-fixtures
FOLDER_NAME=small-eval-"$EVAL_NAME" FOLDER_NAME=small-eval-"$EVAL_NAME"
SOURCE_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME SOURCE_DIR=$OUTPUT_ROOT/gold-standard/$FOLDER_NAME
mkdir -p "$SOURCE_DIR" mkdir -p "$SOURCE_DIR"
aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$SOURCE_DIR" --recursive --no-sign-request --region us-east-2 aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$SOURCE_DIR" --recursive --no-sign-request --region us-east-2

View File

@ -5,10 +5,11 @@ set -e
SCRIPT_DIR=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1 cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=delta-table-dest OUTPUT_FOLDER_NAME=delta-table-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest DESTINATION_TABLE=$OUTPUT_ROOT/delta-table-dest
CI=${CI:-"false"} CI=${CI:-"false"}
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then