from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.pipeline_utils import build_rag_pipeline, build_indexing_pipeline
from haystack.pipeline_utils.indexing import download_files

# We are model agnostic :) In this getting-started example, you can choose any OpenAI or Hugging Face TGI generation model
generation_model = "gpt-3.5-turbo"
API_KEY = "sk-..."  # ADD YOUR KEY HERE

# We support many different databases. Here, we load a simple and lightweight in-memory database.
document_store = InMemoryDocumentStore()

# Download example files from the web
files = download_files(sources=["http://www.paulgraham.com/superlinear.html"])

# Pipelines are our main abstraction.
# Here we create a pipeline that can index TXT and HTML. You can also use your own private files.
indexing_pipeline = build_indexing_pipeline(
    document_store=document_store,
    embedding_model="intfloat/e5-base-v2",
    supported_mime_types=["text/plain", "text/html"],  # "application/pdf"
)
indexing_pipeline.run(files=files)  # you can also supply files=[path_to_directory], which is searched recursively

# RAG pipeline with a vector-based retriever + LLM
rag_pipeline = build_rag_pipeline(
    document_store=document_store,
    embedding_model="intfloat/e5-base-v2",
    generation_model=generation_model,
    llm_api_key=API_KEY,
)

# For details, like which documents were used to generate the answer, look into the result object
result = rag_pipeline.run(query="What are superlinear returns and why are they important?")
print(result.data)
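
# A quick way to inspect which documents grounded the answer. This is a sketch:
# it assumes run() returns a GeneratedAnswer-style object whose `documents` field
# holds the retrieved Documents; check the result object in your installed version.
for doc in result.documents:
    # Each retrieved Document carries its text and metadata (e.g., the source URL, if present).
    print(doc.meta.get("url", doc.id), "-", doc.content[:100])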