ryannikolaidis dd1443ab6f
feat: add Qdrant ingest destination connector (#2338)
This PR intends to add [Qdrant](https://qdrant.tech/) as a supported
ingestion destination.

- Implements CLI and programmatic usage.
- Documentation update
- Integration test script

---
Clone of #2315 to run with CI secrets

---------

Co-authored-by: Anush008 <anushshetty90@gmail.com>
Co-authored-by: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
2024-01-02 22:08:20 +00:00

88 lines
2.3 KiB
Bash
Executable File

#!/bin/bash
# Integration test for the Qdrant ingest destination: this section enables
# strict/trace mode, resolves the directories the test works in and defines
# the settings used by the rest of the script.
set -ex

# Directory holding this script, and its parent directory.
DEST_PATH="$(dirname "$(realpath "${0}")")"
SCRIPT_DIR="$(dirname "${DEST_PATH}")"
# Run from the directory one level above SCRIPT_DIR.
cd "${SCRIPT_DIR}/.." || exit 1

# Output and scratch locations for this destination's run.
OUTPUT_FOLDER_NAME="qdrant-dest"
OUTPUT_DIR="${SCRIPT_DIR}/structured-output/${OUTPUT_FOLDER_NAME}"
WORK_DIR="${SCRIPT_DIR}/workdir/${OUTPUT_FOLDER_NAME}"

# Honour MAX_PROCESSES when provided; otherwise default it to the CPU count
# reported by Python (the := form also assigns MAX_PROCESSES itself).
max_processes=${MAX_PROCESSES:=$(python3 -c 'import os; print(os.cpu_count())')}
# Writers get one process fewer than the maximum; when that leaves one or
# fewer, fall back to 2.
writer_processes=$((max_processes - 1))
if [ "${writer_processes}" -le 1 ]; then
  writer_processes=2
fi

# Qdrant container and endpoint settings.
CONTAINTER_NAME='qdrant_test'
QDRANT_PORT=6333
QDRANT_HOST="localhost:${QDRANT_PORT}"
# Timestamp suffix gives each run its own collection name.
COLLECTION_NAME="qdrant-test-$(date +%s)"

# Number of points the collection must hold after ingest, and how many
# times the readiness probe may fail before giving up.
EXPECTED_POINTS_COUNT=1404
RETRIES=5
#######################################
# Stop the Qdrant test container.
# Globals:   CONTAINTER_NAME (read) - name of the container to stop
# Arguments: none
# Returns:   exit status of `docker stop`
#######################################
stop_docker() {
  # Quoted so the name is always passed to docker as exactly one argument.
  docker stop "$CONTAINTER_NAME"
}
# Launch a detached Qdrant container; --rm removes it once it is stopped.
# The host side of the port mapping follows QDRANT_PORT, because that is the
# port every later request targets through QDRANT_HOST. The container side
# stays at 6333, Qdrant's default REST port.
docker run -d --rm \
  -p "$QDRANT_PORT":6333 \
  --name "$CONTAINTER_NAME" qdrant/qdrant:latest

# Stop the container when the run is interrupted or a command fails.
trap stop_docker SIGINT
trap stop_docker ERR

# Poll the collections endpoint until it answers successfully, pausing 5s
# between attempts; give up once the RETRIES budget is exhausted.
until curl --output /dev/null --silent --get --fail "http://$QDRANT_HOST/collections"; do
  RETRIES=$((RETRIES - 1))
  if [ "$RETRIES" -le 0 ]; then
    echo "Qdrant server failed to start"
    stop_docker
    exit 1
  fi
  # Trailing newline keeps successive progress messages on separate lines.
  printf 'Waiting for Qdrant server to start...\n'
  sleep 5
done
# Create the destination collection with 384-dimensional vectors compared by
# cosine distance.
# NOTE(review): 384 presumably matches the output size of the default
# embedding provider - confirm before changing either one.
# --fail makes curl exit non-zero on an HTTP error response, so a failed
# creation aborts the test here instead of being ignored.
curl --fail -X PUT \
  "http://$QDRANT_HOST/collections/$COLLECTION_NAME" \
  -H 'Content-Type: application/json' \
  -d '{
"vectors": {
"size": 384,
"distance": "Cosine"
}
}'
# Embedding provider used for the run; overridable through the environment.
# NOTE(review): the target collection is created with a fixed vector size, so
# an overriding provider presumably has to produce vectors of that size.
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}

# Ingest the example document from the local source connector, chunk and
# embed it, then write the result to the Qdrant collection. The first
# --num-processes applies to the local pipeline, the second to the qdrant
# destination subcommand.
PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --num-processes "$max_processes" \
  --output-dir "$OUTPUT_DIR" \
  --strategy fast \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1225p.txt \
  --work-dir "$WORK_DIR" \
  --chunk-elements \
  --chunk-combine-text-under-n-chars 200 \
  --chunk-new-after-n-chars 2500 \
  --chunk-max-characters 38000 \
  --chunk-multipage-sections \
  --embedding-provider "$EMBEDDING_PROVIDER" \
  qdrant \
  --collection-name "$COLLECTION_NAME" \
  --location "http://$QDRANT_HOST" \
  --batch-size 80 \
  --num-processes "$writer_processes"
# Ask Qdrant for the exact number of points stored in the collection.
response=$(curl -s -X POST \
  "http://$QDRANT_HOST/collections/$COLLECTION_NAME/points/count" \
  -H 'Content-Type: application/json' \
  -d '{
"exact": true
}')
count=$(jq -r '.result.count' <<<"$response")

# Compare as strings: a missing or non-numeric count (e.g. "null" from an
# error response) must fail the test. A numeric `[ ... -ne ... ]` test errors
# out on such input, which `if` would treat as "counts are equal".
if [[ "$count" != "$EXPECTED_POINTS_COUNT" ]]; then
  echo "Points count assertion failed. Expected: $EXPECTED_POINTS_COUNT. Got: $count. Test failed."
  stop_docker
  exit 1
fi

# Success: shut the container down.
stop_docker