#!/usr/bin/env bash

# Processes example-docs/book-war-and-peace-1225p.txt with the local source
# connector, chunks it by title, embeds the processed chunks, and writes the
# results to a Pinecone index.
#
# Structured outputs are stored in local-to-pinecone/
#
# Values this example leaves blank can be supplied through the environment.
# Each variable falls back to an empty string when unset, so running the
# script without any of them passes the same empty arguments as before:
#
#   EMBEDDING_PROVIDER       -> --embedding-provider
#   INGEST_WORK_DIR          -> --work-dir
#   PINECONE_API_KEY         -> pinecone --api-key
#   PINECONE_INDEX_NAME      -> pinecone --index-name
#   PINECONE_ENVIRONMENT     -> pinecone --environment
#   PINECONE_BATCH_SIZE      -> pinecone --batch-size
#   PINECONE_NUM_PROCESSES   -> pinecone --num-processes
#
# NOTE: the API key is still handed to the CLI as a command-line argument, so
# it is visible in the process list while the job runs.

# Absolute directory containing this script, independent of the caller's cwd.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

# Move three directories up so that the relative paths used below
# (./unstructured/ingest/main.py, example-docs/...) resolve correctly.
cd "$SCRIPT_DIR"/../../.. || exit 1

# As an example we're using the local source connector,
# however ingesting from any supported source connector is possible.
# shellcheck disable=2094
PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --input-path example-docs/book-war-and-peace-1225p.txt \
  --output-dir local-to-pinecone \
  --strategy fast \
  --chunking-strategy by_title \
  --embedding-provider "${EMBEDDING_PROVIDER:-}" \
  --num-processes 2 \
  --verbose \
  --work-dir "${INGEST_WORK_DIR:-}" \
  pinecone \
  --api-key "${PINECONE_API_KEY:-}" \
  --index-name "${PINECONE_INDEX_NAME:-}" \
  --environment "${PINECONE_ENVIRONMENT:-}" \
  --batch-size "${PINECONE_BATCH_SIZE:-}" \
  --num-processes "${PINECONE_NUM_PROCESSES:-}"