mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-19 06:09:32 +00:00

### Description
Tests are often skipped, either because required env vars are missing or because the base script explicitly excludes them, but these skips get lost in the logs. This PR updates the scripts to exit with a custom error code when a test is skipped due to missing env vars. The base script catches this error code and records every skipped file in a log file, which is printed in the CI output at the end of the run.
70 lines
2.1 KiB
Bash
Executable File
70 lines
2.1 KiB
Bash
Executable File
#!/usr/bin/env bash

# Ingest test for the MongoDB destination.
#
# Required env vars: MONGODB_URI, MONGODB_DATABASE_NAME.
# Optional env vars: OUTPUT_ROOT, MAX_PROCESSES, CI.
# Exits with status 8 (the custom "skipped" status) when a required env var
# is missing.

set -e

# Resolve paths relative to this script and run from the directory above
# SCRIPT_DIR, so the relative paths used later in the script resolve.
DEST_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$DEST_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=mongodb-dest
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
# Defaults to the CPU count; `:=` also assigns the default to MAX_PROCESSES.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
# A fresh collection name per run.
DESTINATION_MONGO_COLLECTION="utic-test-ingest-fixtures-output-$(uuidgen)"
# Not referenced in the visible part of this script; presumably read by the
# sourced cleanup.sh -- TODO confirm.
CI=${CI:-"false"}

# Both variables are needed by every MongoDB call in this script, so skip when
# EITHER is missing (previously the test was skipped only when both were empty
# and failed later if just one was set).
if [ -z "$MONGODB_URI" ] || [ -z "$MONGODB_DATABASE_NAME" ]; then
  echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set."
  exit 8
fi

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
#######################################
# Exit handler: cleans up this test's output and work directories via
# cleanup_dir, then runs the MongoDB helper script's "down" action for the
# test collection.
# Every step is attempted even when an earlier one fails, so that under
# `set -e` a failed directory cleanup cannot skip the "down" step and leave
# the per-run collection behind.
# Globals (read): OUTPUT_DIR, WORK_DIR, SCRIPT_DIR, MONGODB_URI,
#                 MONGODB_DATABASE_NAME, DESTINATION_MONGO_COLLECTION
# Outputs: a diagnostic line on stderr for each step that fails.
# Returns: 0 if every step succeeded, otherwise the status of the first
#          step that failed.
#######################################
function cleanup() {
  local first_failure=0 rc dir

  for dir in "$OUTPUT_DIR" "$WORK_DIR"; do
    rc=0
    # `|| rc=$?` records the failure without letting errexit abort the handler.
    cleanup_dir "$dir" || rc=$?
    if [ "$rc" -ne 0 ]; then
      echo "cleanup: cleanup_dir failed for $dir (exit status $rc)" >&2
      [ "$first_failure" -ne 0 ] || first_failure=$rc
    fi
  done

  rc=0
  python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
    --uri "$MONGODB_URI" \
    --database "$MONGODB_DATABASE_NAME" \
    --collection "$DESTINATION_MONGO_COLLECTION" down || rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "cleanup: MongoDB 'down' step failed for collection $DESTINATION_MONGO_COLLECTION (exit status $rc)" >&2
    [ "$first_failure" -ne 0 ] || first_failure=$rc
  fi

  return "$first_failure"
}

# Run cleanup on every exit path from here on, including failures under set -e.
trap cleanup EXIT

# Connection arguments shared by every call to the MongoDB helper script.
mongo_args=(
  --uri "$MONGODB_URI"
  --database "$MONGODB_DATABASE_NAME"
  --collection "$DESTINATION_MONGO_COLLECTION"
)

# Helper "up" action (counterpart of the "down" action run by cleanup).
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py "${mongo_args[@]}" up

# Ingest one local PDF and write the results to the MongoDB destination.
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --num-processes "$max_processes" \
  --output-dir "$OUTPUT_DIR" \
  --strategy fast \
  --verbose \
  --reprocess \
  --input-path example-docs/fake-memo.pdf \
  --work-dir "$WORK_DIR" \
  --embedding-provider "langchain-huggingface" \
  mongodb \
  "${mongo_args[@]}"

# Helper "check" action, expecting 5 records in the collection.
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py "${mongo_args[@]}" \
  check --expected-records 5

# Helper "check-vector" action against the local JSON output of the run.
# OUTPUT_DIR is $OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME, so this is
# the same path as before, now fully quoted.
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py "${mongo_args[@]}" \
  check-vector \
  --output-json "$OUTPUT_DIR"/fake-memo.pdf.json