2024-01-10 21:20:42 +01:00
|
|
|
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
2023-12-09 19:01:13 +01:00
|
|
|
from haystack.pipeline_utils import build_rag_pipeline, build_indexing_pipeline
|
|
|
|
from haystack.pipeline_utils.indexing import download_files
|
|
|
|
|
|
|
|
# Haystack is model agnostic: for this getting-started example you can pick
# any OpenAI or Hugging Face TGI generation model.
generation_model = "gpt-3.5-turbo"

# Credential passed to the RAG pipeline below as `llm_api_key`.
API_KEY = "sk-..."  # ADD YOUR KEY HERE
|
|
|
|
|
|
|
|
# Many different databases are supported. Here we use a simple and
# lightweight in-memory document store; the same instance is passed to both
# the indexing pipeline and the RAG pipeline below.
document_store = InMemoryDocumentStore()
|
|
|
|
|
|
|
|
# Download the example file(s) from the web; the result is fed to the
# indexing pipeline below.
files = download_files(sources=["http://www.paulgraham.com/superlinear.html"])
|
|
|
|
|
|
|
|
# Pipelines are our main abstraction.
|
|
|
|
# Create a pipeline that can index TXT and HTML. You can also use your own
# private files.
indexing_pipeline = build_indexing_pipeline(
    document_store=document_store,
    embedding_model="intfloat/e5-base-v2",
    supported_mime_types=["text/plain", "text/html"],  # "application/pdf"
)

# Index the downloaded files. You can also supply files=[path_to_directory],
# which is searched recursively.
indexing_pipeline.run(files=files)
|
|
|
|
|
|
|
|
# RAG pipeline with a vector-based retriever + LLM. It reads from the same
# document store and is given the same embedding model name as the indexing
# pipeline above.
rag_pipeline = build_rag_pipeline(
    document_store=document_store,
    embedding_model="intfloat/e5-base-v2",
    generation_model=generation_model,
    llm_api_key=API_KEY,
)
|
|
|
|
|
|
|
|
# Ask a question against the indexed documents. For details, such as which
# documents were used to generate the answer, look into the result object.
result = rag_pipeline.run(query="What are superlinear returns and why are they important?")

# Print the result's `data` attribute.
print(result.data)
|