76 lines
2.4 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
set -e
DEST_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$DEST_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=mongodb-dest
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
DESTINATION_MONGO_COLLECTION="utic-test-ingest-fixtures-output-$(uuidgen)"
CI=${CI:-"false"}
if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then
echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set."
exit 8
fi
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" down
}
trap cleanup EXIT
# NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library,
# which is incompatible with the bson installed from pypi. bson is installed as part of the
# astra dependencies.
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
pip uninstall -y bson pymongo
make install-ingest-mongodb
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" up
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \
--reprocess \
--input-path example-docs/fake-memo.pdf \
--work-dir "$WORK_DIR" \
--embedding-provider "langchain-huggingface" \
mongodb \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION"
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" \
check --expected-records 5
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
--uri "$MONGODB_URI" \
--database "$MONGODB_DATABASE_NAME" \
--collection "$DESTINATION_MONGO_COLLECTION" \
check-vector \
--output-json "$OUTPUT_ROOT"/structured-output/$OUTPUT_FOLDER_NAME/fake-memo.pdf.json