mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-24 09:26:08 +00:00

### Description In an effort to mitigate resource consumption when running CI tests, cleanup download dir for ingest tests after each one.
48 lines
1.6 KiB
Bash
Executable File
48 lines
1.6 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
set -e
|
|
|
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
|
cd "$SCRIPT_DIR"/.. || exit 1
|
|
OUTPUT_FOLDER_NAME=google-drive
|
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
|
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
|
CI=${CI:-"false"}
|
|
|
|
# shellcheck disable=SC1091
|
|
source "$SCRIPT_DIR"/cleanup.sh
|
|
function cleanup() {
|
|
cleanup_dir "$OUTPUT_DIR"
|
|
if [ "$CI" == "true" ]; then
|
|
cleanup_dir "$DOWNLOAD_DIR"
|
|
fi
|
|
}
|
|
trap cleanup EXIT
|
|
|
|
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
|
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
|
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
|
|
exit 0
|
|
fi
|
|
|
|
# Create temporary service key file
|
|
GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
|
|
echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE"
|
|
|
|
PYTHONPATH=. unstructured/ingest/main.py \
|
|
google-drive \
|
|
--download-dir "$DOWNLOAD_DIR" \
|
|
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
|
--num-processes "$max_processes" \
|
|
--strategy hi_res \
|
|
--preserve-downloads \
|
|
--reprocess \
|
|
--output-dir "$OUTPUT_DIR" \
|
|
--verbose \
|
|
--drive-id 1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr \
|
|
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE"
|
|
|
|
|
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|