mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	This PR intends to add [Qdrant](https://qdrant.tech/) as a supported ingestion destination. - Implements CLI and programmatic usage. - Documentation update - Integration test script --- Clone of #2315 to run with CI secrets --------- Co-authored-by: Anush008 <anushshetty90@gmail.com> Co-authored-by: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
		
			
				
	
	
		
			88 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			88 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/bin/bash
 | 
						|
 | 
						|
# Integration test for the Qdrant ingest destination.
# Starts a local Qdrant container, ingests a sample document into a fresh
# collection, and checks the resulting point count.
#
# Optional environment:
#   MAX_PROCESSES  number of ingest worker processes (default: CPU count)

# Abort on the first failing command and trace every command executed.
set -e
set -x

# Resolve locations relative to this script: DEST_PATH is the directory that
# holds the script, SCRIPT_DIR is its parent. Work from one level above that.
DEST_PATH="$(dirname "$(realpath "$0")")"
SCRIPT_DIR="$(dirname "$DEST_PATH")"
cd "${SCRIPT_DIR}/.." || exit 1

# Where the ingest run writes its output and intermediate files.
OUTPUT_FOLDER_NAME="qdrant-dest"
OUTPUT_DIR="${SCRIPT_DIR}/structured-output/${OUTPUT_FOLDER_NAME}"
WORK_DIR="${SCRIPT_DIR}/workdir/${OUTPUT_FOLDER_NAME}"

# Honour MAX_PROCESSES when provided; otherwise default it to the CPU count
# reported by Python (the := form also assigns MAX_PROCESSES itself).
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# Writers get one process fewer than the total, falling back to 2 whenever
# that would leave one or none.
# NOTE(review): the fallback can exceed max_processes - 1; confirm intent.
if ((max_processes - 1 > 1)); then
  writer_processes=$((max_processes - 1))
else
  writer_processes=2
fi

# Qdrant container and endpoint settings.
CONTAINTER_NAME="qdrant_test"
QDRANT_PORT=6333
QDRANT_HOST="localhost:${QDRANT_PORT}"

# The epoch-seconds suffix gives each run its own collection name.
COLLECTION_NAME="qdrant-test-$(date +%s)"

# Number of points the collection must contain once ingest has finished.
EXPECTED_POINTS_COUNT=1404

# Budget of failed readiness probes before giving up on the server.
RETRIES=5
 | 
						|
 | 
						|
#######################################
# Stop the Qdrant test container.
# Globals:   CONTAINTER_NAME (read) - name of the container to stop
# Arguments: none
# Returns:   exit status of `docker stop`
#######################################
function stop_docker() {
  # Quoted so the name is always passed to docker as a single argument.
  docker stop "$CONTAINTER_NAME"
}
 | 
						|
 | 
						|
# Start Qdrant detached; --rm discards the container as soon as it is stopped.
# Port publishing is HOST:CONTAINER. The host side follows QDRANT_PORT (the
# port the curl calls in this script use), while the container side is 6333,
# the port the Qdrant image serves its HTTP API on.
docker run -d --rm \
  -p "$QDRANT_PORT":6333 \
  --name "$CONTAINTER_NAME" qdrant/qdrant:latest

# Stop the container if the run is interrupted or any command fails.
trap stop_docker SIGINT
trap stop_docker ERR
 | 
						|
 | 
						|
# Probe the collections endpoint until Qdrant answers successfully. Each
# failed probe consumes one retry; when the budget is exhausted the container
# is stopped and the script exits with an error.
until curl --output /dev/null --silent --get --fail "http://$QDRANT_HOST/collections"; do
  RETRIES=$((RETRIES - 1))
  if [ "$RETRIES" -le 0 ]; then
    echo "Qdrant server failed to start"
    stop_docker
    exit 1
  fi
  # Trailing newline keeps repeated progress messages on separate lines.
  printf 'Waiting for Qdrant server to start...\n'
  sleep 5
done
 | 
						|
 | 
						|
# Create the collection that the ingest run writes to.
# --fail turns an HTTP error response into a non-zero exit status, so a failed
# creation aborts the script (via set -e / the ERR trap) instead of being
# silently ignored.
# NOTE(review): size 384 presumably matches the embedding model's vector
# dimension; confirm if the embedding provider changes.
curl --fail -X PUT \
  "http://$QDRANT_HOST/collections/$COLLECTION_NAME" \
  -H 'Content-Type: application/json' \
  -d '{
    "vectors": {
      "size": 384,
      "distance": "Cosine"
    }
}'
 | 
						|
 | 
						|
# Embedding provider for the run; overridable through the environment.
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}

# Ingest the sample document from the local source connector, chunk and embed
# it, then write the results to the Qdrant collection. Options before the
# `qdrant` subcommand configure the source/processing side; options after it
# configure the destination writer.
PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --num-processes "$max_processes" \
  --output-dir "$OUTPUT_DIR" \
  --strategy fast \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1225p.txt \
  --work-dir "$WORK_DIR" \
  --chunk-elements \
  --chunk-combine-text-under-n-chars 200 \
  --chunk-new-after-n-chars 2500 \
  --chunk-max-characters 38000 \
  --chunk-multipage-sections \
  --embedding-provider "$EMBEDDING_PROVIDER" \
  qdrant \
  --collection-name "$COLLECTION_NAME" \
  --location "http://$QDRANT_HOST" \
  --batch-size 80 \
  --num-processes "$writer_processes"
 | 
						|
 | 
						|
# Ask Qdrant for the exact number of points stored in the collection.
response=$(curl -s -X POST \
  "http://$QDRANT_HOST/collections/$COLLECTION_NAME/points/count" \
  -H 'Content-Type: application/json' \
  -d '{
     "exact": true
}')

count=$(echo "$response" | jq -r '.result.count')

# Compare as strings: a non-numeric count (e.g. "null" from an error response)
# would make an integer test error out, which `if` treats as "false" and would
# therefore let the test pass silently.
if [ "$count" != "$EXPECTED_POINTS_COUNT" ]; then
  echo "Points count assertion failed. Expected: $EXPECTED_POINTS_COUNT. Got: $count. Test failed."
  stop_docker
  exit 1
fi

# Success: tear the container down.
stop_docker
 |