mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-10 02:25:55 +00:00

### Summary Closes #3049. Reenables the MongoDB connector test, which was disabled previously in #3047 due to incompatibility between the `pymongo` and the `bson` package from `pip`, which is a dependency for the Astra connector. Per the `pymongo` docs below, `pymongo` ships with its own version of `bson` and installing `bson` from `pip` breaks `pymongo`. - https://pymongo.readthedocs.io/en/stable/installation.html ### Testing Ingest tests ran successfully for the [source connector](https://github.com/Unstructured-IO/unstructured/actions/runs/9273154676/job/25512636315) and the [destination connector](https://github.com/Unstructured-IO/unstructured/actions/runs/9273154676/job/25512635546).
76 lines
2.4 KiB
Bash
Executable File
76 lines
2.4 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
set -e
|
|
|
|
DEST_PATH=$(dirname "$(realpath "$0")")
|
|
SCRIPT_DIR=$(dirname "$DEST_PATH")
|
|
cd "$SCRIPT_DIR"/.. || exit 1
|
|
OUTPUT_FOLDER_NAME=mongodb-dest
|
|
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
|
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
|
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
|
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
|
DESTINATION_MONGO_COLLECTION="utic-test-ingest-fixtures-output-$(uuidgen)"
|
|
CI=${CI:-"false"}
|
|
|
|
if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then
|
|
echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set."
|
|
exit 8
|
|
fi
|
|
|
|
# shellcheck disable=SC1091
|
|
source "$SCRIPT_DIR"/cleanup.sh
|
|
function cleanup() {
|
|
cleanup_dir "$OUTPUT_DIR"
|
|
cleanup_dir "$WORK_DIR"
|
|
|
|
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
|
--uri "$MONGODB_URI" \
|
|
--database "$MONGODB_DATABASE_NAME" \
|
|
--collection "$DESTINATION_MONGO_COLLECTION" down
|
|
|
|
}
|
|
|
|
trap cleanup EXIT
|
|
|
|
# NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library,
|
|
# which is incompatible with the bson installed from pypi. bson is installed as part of the
|
|
# astra dependencies.
|
|
# ref: https://pymongo.readthedocs.io/en/stable/installation.html
|
|
pip uninstall -y bson pymongo
|
|
make install-ingest-mongodb
|
|
|
|
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
|
--uri "$MONGODB_URI" \
|
|
--database "$MONGODB_DATABASE_NAME" \
|
|
--collection "$DESTINATION_MONGO_COLLECTION" up
|
|
|
|
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
|
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
|
local \
|
|
--num-processes "$max_processes" \
|
|
--output-dir "$OUTPUT_DIR" \
|
|
--strategy fast \
|
|
--verbose \
|
|
--reprocess \
|
|
--input-path example-docs/fake-memo.pdf \
|
|
--work-dir "$WORK_DIR" \
|
|
--embedding-provider "langchain-huggingface" \
|
|
mongodb \
|
|
--uri "$MONGODB_URI" \
|
|
--database "$MONGODB_DATABASE_NAME" \
|
|
--collection "$DESTINATION_MONGO_COLLECTION"
|
|
|
|
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
|
--uri "$MONGODB_URI" \
|
|
--database "$MONGODB_DATABASE_NAME" \
|
|
--collection "$DESTINATION_MONGO_COLLECTION" \
|
|
check --expected-records 5
|
|
|
|
python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
|
--uri "$MONGODB_URI" \
|
|
--database "$MONGODB_DATABASE_NAME" \
|
|
--collection "$DESTINATION_MONGO_COLLECTION" \
|
|
check-vector \
|
|
--output-json "$OUTPUT_ROOT"/structured-output/$OUTPUT_FOLDER_NAME/fake-memo.pdf.json
|