mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-16 20:57:50 +00:00
ci: parametrize ingest test checking scripts (#2062)
- parametrize the output folder paths and expected output folder paths in the comparison scripts
- now allows the user to set the env variable `OUTPUT_ROOT` to control where the output and expected output are located
- currently assumes the output from the test and the expected output are in the same directory; this may need separation later

## test run

```bash
OUTPUT_ROOT=/tmp ./test_unstructured_ingest/test-ingest-src.sh
```

This should show which files changed, but it is not able to show a diff since there is no expected output content at `OUTPUT_ROOT`. Then run

```bash
cp -R test_unstructured_ingest/expected-* /tmp/
OUTPUT_ROOT=/tmp ./test_unstructured_ingest/test-ingest-src.sh
```

Now we can see the actual line-by-line diff (which appears because CI and a local instance produce different results).
This commit is contained in:
parent
1ead5a27df
commit
36c4441e2b
@ -8,7 +8,6 @@
|
||||
|
||||
* **Include `languages` in metadata when partitioning strategy='hi_res' or 'fast'** User-defined `languages` was previously used for text detection, but not included in the resulting element metadata for some strategies. `languages` will now be included in the metadata regardless of partition strategy for PDFs and images.
|
||||
|
||||
|
||||
## 0.10.30
|
||||
|
||||
### Enhancements
|
||||
|
@ -16,10 +16,11 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true}
|
||||
OUTPUT_FOLDER_NAME=$1
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_DIR_TEXT=$OUTPUT_ROOT/text-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR=$OUTPUT_ROOT/expected-structured-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR_TEXT=$OUTPUT_ROOT/expected-text-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
|
@ -11,7 +11,8 @@ set +e
|
||||
EXPECTED_NUM_DIRS=$1
|
||||
OUTPUT_FOLDER_NAME=$2
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
|
||||
NUMBER_OF_FOUND_DIRS="$(find "$OUTPUT_DIR" -type d -exec printf '.' \; | wc -c | xargs)"
|
||||
|
@ -13,7 +13,8 @@ EXPECTED_NUM_FILES=$1
|
||||
OUTPUT_FOLDER_NAME=$2
|
||||
EXPECTED_SIZE=$3
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
EXPECTED_OUTPUT_DIR=$OUTPUT_ROOT/expected-structured-output/$OUTPUT_FOLDER_NAME
|
||||
NUM_FILES=$(find "$EXPECTED_OUTPUT_DIR" -type f -size +"$EXPECTED_SIZE" | wc -l)
|
||||
|
||||
# Note: single brackets and "-ne" operator were necessary for evaluation in CI
|
||||
|
@ -11,7 +11,8 @@ set +e
|
||||
EXPECTED_NUM_FILES=$1
|
||||
OUTPUT_FOLDER_NAME=$2
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)"
|
||||
|
||||
# Note: single brackets and "-ne" operator were necessary for evaluation in CI
|
||||
|
@ -9,7 +9,8 @@ OUTPUT_DIR=$1
|
||||
OUTPUT_FOLDER_NAME=$2
|
||||
structured_outputs=("$OUTPUT_DIR"/*)
|
||||
|
||||
CP_DIR=$SCRIPT_DIR/structured-output-eval/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
CP_DIR=$OUTPUT_ROOT/structured-output-eval/$OUTPUT_FOLDER_NAME
|
||||
mkdir -p "$CP_DIR"
|
||||
|
||||
selected_outputs=$(cat "$SCRIPT_DIR/metrics/metrics-json-manifest.txt")
|
||||
|
@ -6,7 +6,8 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
|
||||
# List all structured outputs to use in this evaluation
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output-eval
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output-eval
|
||||
mkdir -p "$OUTPUT_DIR"
|
||||
|
||||
EVAL_NAME="$1"
|
||||
@ -23,7 +24,7 @@ fi
|
||||
# Download cct test from s3
|
||||
BUCKET_NAME=utic-dev-tech-fixtures
|
||||
FOLDER_NAME=small-eval-"$EVAL_NAME"
|
||||
SOURCE_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME
|
||||
SOURCE_DIR=$OUTPUT_ROOT/gold-standard/$FOLDER_NAME
|
||||
mkdir -p "$SOURCE_DIR"
|
||||
aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$SOURCE_DIR" --recursive --no-sign-request --region us-east-2
|
||||
|
||||
|
@ -5,10 +5,11 @@ set -e
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=delta-table-dest
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
|
||||
DESTINATION_TABLE=$OUTPUT_ROOT/delta-table-dest
|
||||
CI=${CI:-"false"}
|
||||
|
||||
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
||||
|
Loading…
x
Reference in New Issue
Block a user