#!/usr/bin/env bash

# End-to-end ingest test for the Vectara destination connector.
#
# Runs the local source connector over an example document with `vectara` as
# the destination, then asks the Vectara API for the size of the corpus that
# was written and compares it with EXPECTED_CORPUS_SIZE. On exit, the corpus is
# deleted and the work/output directories are handed to cleanup_dir.
#
# Required env vars:
#   VECTARA_OAUTH_CLIENT_ID, VECTARA_OAUTH_SECRET, VECTARA_CUSTOMER_ID
# Optional env vars:
#   MAX_PROCESSES - value passed to --num-processes (defaults to the CPU count
#                   reported by python3)
# Exit codes:
#   0 - corpus size matched
#   1 - could not cd to the repo root, a lookup returned nothing, or the size
#       did not match
#   8 - skipped because one or more credentials are missing
#   other non-zero - a command failed under `set -e`

set -e

DEST_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$DEST_PATH")
cd "$SCRIPT_DIR"/.. || exit 1

OUTPUT_FOLDER_NAME=local-vectara-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# A random suffix keeps runs from sharing a corpus name.
RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
CORPUS_NAME="test-corpus-vectara-"$RANDOM_SUFFIX

# Expected size of the uploaded document
EXPECTED_CORPUS_SIZE=8842684

# All three credentials are needed, so skip when ANY of them is missing.
if [[ -z "$VECTARA_OAUTH_CLIENT_ID" || -z "$VECTARA_OAUTH_SECRET" || -z "$VECTARA_CUSTOMER_ID" ]]; then
  echo "Skipping VECTARA ingest test because VECTARA_OAUTH_CLIENT_ID, VECTARA_OAUTH_SECRET, or VECTARA_CUSTOMER_ID env var is not set."
  exit 8
fi

# Provides cleanup_dir.
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

#######################################
# EXIT trap: delete the remote test corpus (when one was resolved) and clean
# up the local work and output directories.
# Globals:   access_token, corpus_id, CORPUS_NAME, VECTARA_CUSTOMER_ID,
#            WORK_DIR, OUTPUT_DIR (all read)
# Arguments: none
#######################################
cleanup() {
  # The script may exit before the token or corpus id were resolved; without
  # both there is nothing we can address, and the request body would not even
  # be valid JSON.
  if [[ -n "${access_token:-}" && -n "${corpus_id:-}" ]]; then
    echo "Deleting corpus $corpus_id ($CORPUS_NAME)"
    # Best effort: a failed delete must not prevent the local cleanup below.
    curl -sS -L -X POST 'https://api.vectara.io/v1/delete-corpus' \
      -H 'Content-Type: application/json' \
      -H 'Accept: application/json' \
      -H "Authorization: Bearer $access_token" \
      -H "customer-id: $VECTARA_CUSTOMER_ID" \
      --data-raw "{ \"corpusId\": $corpus_id }" ||
      echo "Failed to delete corpus $corpus_id ($CORPUS_NAME)" >&2
  else
    echo "No corpus id resolved for $CORPUS_NAME; skipping remote corpus deletion" >&2
  fi

  # Local file cleanup
  cleanup_dir "$WORK_DIR"
  cleanup_dir "$OUTPUT_DIR"
}
trap cleanup EXIT

PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --num-processes "$max_processes" \
  --output-dir "$OUTPUT_DIR" \
  --strategy fast \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1225p.txt \
  --work-dir "$WORK_DIR" \
  vectara \
  --customer-id "$VECTARA_CUSTOMER_ID" \
  --oauth-client-id "$VECTARA_OAUTH_CLIENT_ID" \
  --oauth-secret "$VECTARA_OAUTH_SECRET" \
  --corpus-name "$CORPUS_NAME"

# Get JWT token
jwt_token_resp=$(curl -sS -XPOST -H "Content-type: application/x-www-form-urlencoded" -d \
  "grant_type=client_credentials&client_id=$VECTARA_OAUTH_CLIENT_ID&client_secret=$VECTARA_OAUTH_SECRET" \
  "https://vectara-prod-$VECTARA_CUSTOMER_ID.auth.us-west-2.amazoncognito.com/oauth2/token")
access_token=$(echo "$jwt_token_resp" | jq -r '.access_token')

# `jq -r` prints the literal string "null" for a missing field.
if [[ -z "$access_token" || "$access_token" == "null" ]]; then
  access_token=""
  echo "Failed to obtain an access token for customer $VECTARA_CUSTOMER_ID" >&2
  exit 1
fi

# Get corpus ID from name
# NOTE(review): the first entry of the response is taken as-is; confirm that
# the filter cannot match another corpus whose name merely contains this one.
corpora_resp=$(curl -sS -L -X POST 'https://api.vectara.io/v1/list-corpora' \
  -H 'Content-Type: application/json' \
  -H 'Accept: application/json' \
  -H "customer-id: $VECTARA_CUSTOMER_ID" \
  -H "Authorization: Bearer $access_token" \
  --data-raw "{ \"numResults\": 100, \"filter\": \"$CORPUS_NAME\" }")
corpus_id=$(echo "$corpora_resp" | jq -r '.corpus[0].id')

if [[ -z "$corpus_id" || "$corpus_id" == "null" ]]; then
  # Clear it so the EXIT trap does not try to delete corpus "null".
  corpus_id=""
  echo "Could not find a corpus named $CORPUS_NAME" >&2
  exit 1
fi

# Check that the size of the corpus is as expected
get_corpus_size=$(curl -sS -L -X POST 'https://api.vectara.io/v1/compute-corpus-size' \
  -H 'Content-Type: application/json' \
  -H 'Accept: application/json' \
  -H "customer-id: $VECTARA_CUSTOMER_ID" \
  -H "Authorization: Bearer $access_token" \
  --data-raw "{ \"corpusId\": $corpus_id }")
corpus_size=$(echo "$get_corpus_size" | jq -r '.size.size')

if [[ "$corpus_size" == "$EXPECTED_CORPUS_SIZE" ]]; then
  echo "Corpus size is as expected: $corpus_size"
else
  echo "Corpus size is not as expected: $corpus_size"
  echo "vs $EXPECTED_CORPUS_SIZE"
  exit 1
fi