mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-22 15:50:22 +00:00

### Description Often times there are tests being skipped either due to missing env vars or explicitly defined in the base script but these get lost in the logs. This PR updates the scripts to leverage a custom error code if being skipped due to missing env vars and this custom error code is being caught by the base script and logs all files being skipped to a file. At the end of the script, this file gets logged in the CI output.
62 lines
1.7 KiB
Bash
Executable File
62 lines
1.7 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
set -e
|
|
|
|
DEST_PATH=$(dirname "$(realpath "$0")")
|
|
SCRIPT_DIR=$(dirname "$DEST_PATH")
|
|
cd "$SCRIPT_DIR"/.. || exit 1
|
|
OUTPUT_FOLDER_NAME=gcs-dest
|
|
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
|
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
|
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
|
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
|
BUCKET="utic-test-ingest-fixtures-output"
|
|
DIRECTORY=$(uuidgen)
|
|
DESTINATION_GCS="gs://$BUCKET/$DIRECTORY"
|
|
CI=${CI:-"false"}
|
|
|
|
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
|
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
|
exit 8
|
|
fi
|
|
|
|
# Create temporary service key file
|
|
GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
|
|
echo "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
|
|
|
|
# shellcheck disable=SC1091
|
|
source "$SCRIPT_DIR"/cleanup.sh
|
|
function cleanup() {
|
|
cleanup_dir "$OUTPUT_DIR"
|
|
cleanup_dir "$WORK_DIR"
|
|
|
|
python "$SCRIPT_DIR"/python/test-gcs-output.py down \
|
|
--service-account-file "$GCP_INGEST_SERVICE_KEY_FILE" \
|
|
--bucket "$BUCKET" \
|
|
--blob-path "$DIRECTORY"
|
|
|
|
}
|
|
|
|
trap cleanup EXIT
|
|
|
|
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
|
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
|
local \
|
|
--num-processes "$max_processes" \
|
|
--output-dir "$OUTPUT_DIR" \
|
|
--strategy fast \
|
|
--verbose \
|
|
--reprocess \
|
|
--input-path example-docs/fake-memo.pdf \
|
|
--work-dir "$WORK_DIR" \
|
|
gcs \
|
|
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \
|
|
--remote-url "$DESTINATION_GCS"
|
|
|
|
# Simply check the number of files uploaded
|
|
python "$SCRIPT_DIR"/python/test-gcs-output.py check \
|
|
--expected-files 1 \
|
|
--service-account-file "$GCP_INGEST_SERVICE_KEY_FILE" \
|
|
--bucket "$BUCKET" \
|
|
--blob-path "$DIRECTORY"
|