docs: Add minimal getting started code to showcase haystack + RAG (#5578)
* init
* Change question
* Add TODO comment
* Addressing feedback
* Add local folder option. Move additional functions inside haystack.utils for easier imports
* Apply Daria's review suggestions
* Add integration test
* change string formatting
* Add outputparser to HF
* Exclude anthropic test

Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
This commit is contained in: parent 10d6886255, commit d048bb5352
examples/getting_started.py (new file, 34 lines)
@@ -0,0 +1,34 @@
from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import build_pipeline, add_example_data, print_answers


def getting_started(provider, API_KEY):
    """
    This getting_started example shows you how to use LLMs with your data with a technique called Retrieval Augmented Generation - RAG.

    :param provider: We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai".
    :param API_KEY: The API key matching the provider.

    """

    # We support many different databases. Here we load a simple and lightweight in-memory database.
    document_store = InMemoryDocumentStore(use_bm25=True)

    # Pipelines are the main abstraction in Haystack, they connect components like LLMs and databases.
    pipeline = build_pipeline(provider, API_KEY, document_store)

    # Download and add Game of Thrones TXT articles to Haystack's database.
    # You can also provide a folder with your local documents.
    # You might need to install additional dependencies - look inside the function for more information.
    add_example_data(document_store, "data/GoT_getting_started")

    # Ask a question on the data you just added.
    result = pipeline.run(query="Who is the father of Arya Stark?")

    # For details such as which documents were used to generate the answer, look into the <result> object.
    print_answers(result, details="medium")
    return result


if __name__ == "__main__":
    getting_started(provider="openai", API_KEY="ADD KEY HERE")
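For a quick smoke test, the same entry point can be driven with the key read from an environment variable instead of being edited into the file; a minimal sketch, assuming OPENAI_API_KEY is exported in your shell (this mirrors how the integration test below invokes it):

import os

from examples.getting_started import getting_started

# Read the key from the environment rather than hard-coding it in the script.
result = getting_started(provider="openai", API_KEY=os.environ.get("OPENAI_API_KEY", ""))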
examples/test_getting_started.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import os

import pytest

from examples.getting_started import getting_started
from haystack.schema import Answer, Document


@pytest.mark.integration
@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
def test_getting_started(provider):
    if provider == "anthropic":
        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    elif provider == "cohere":
        api_key = os.environ.get("COHERE_API_KEY", "")
    elif provider == "huggingface":
        api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
    elif provider == "openai":
        api_key = os.environ.get("OPENAI_API_KEY", "")
    result = getting_started(provider=provider, API_KEY=api_key)

    # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
    assert isinstance(result, dict)
    assert type(result["answers"][0]) == Answer
    assert type(result["documents"][0]) == Document
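Because these tests call external model APIs, they are gated behind the integration marker; one way to run just this file is `pytest -m integration examples/test_getting_started.py`, with the relevant *_API_KEY environment variables exported beforehand.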
haystack/utils/__init__.py
@@ -24,3 +24,4 @@ from haystack.utils.experiment_tracking import (
 from haystack.utils.early_stopping import EarlyStopping
 from haystack.utils.labels import aggregate_labels
 from haystack.utils.batching import get_batches_from_generator
+from haystack.utils.getting_started import build_pipeline, add_example_data
haystack/utils/getting_started.py (new file, 84 lines)
@@ -0,0 +1,84 @@
import logging
import os

from haystack.utils import convert_files_to_docs
from haystack.utils import fetch_archive_from_http

logger = logging.getLogger(__name__)


def build_pipeline(provider, API_KEY, document_store):
    # Importing top-level causes a circular import
    from haystack.nodes import AnswerParser, PromptNode, PromptTemplate, BM25Retriever
    from haystack.pipelines import Pipeline

    provider = provider.lower()
    # A retriever selects the right documents when given a question.
    retriever = BM25Retriever(document_store=document_store, top_k=5)
    # Load prompt for doing retrieval augmented generation from https://prompthub.deepset.ai/?prompt=deepset%2Fquestion-answering-with-references
    question_answering_with_references = PromptTemplate(
        prompt="deepset/question-answering-with-references",
        output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]"),
    )
    # Load the LLM model
    if provider == "anthropic":
        prompt_node = PromptNode(
            model_name_or_path="claude-2", api_key=API_KEY, default_prompt_template=question_answering_with_references
        )
    elif provider == "cohere":
        prompt_node = PromptNode(
            model_name_or_path="command", api_key=API_KEY, default_prompt_template=question_answering_with_references
        )
    elif provider == "huggingface":
        # TODO: swap out for meta-llama/Llama-2-7b-chat-hf or the 40b model once supported in Haystack+HF API free tier
        # The tiiuae/falcon-7b-instruct model cannot handle a complex prompt with references, so we use a very simple one
        simple_QA = PromptTemplate(
            prompt="deepset/question-answering", output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]")
        )
        prompt_node = PromptNode(
            model_name_or_path="tiiuae/falcon-7b-instruct", api_key=API_KEY, default_prompt_template=simple_QA
        )
    elif provider == "openai":
        prompt_node = PromptNode(
            model_name_or_path="gpt-3.5-turbo-0301",
            api_key=API_KEY,
            default_prompt_template=question_answering_with_references,
        )
    else:
        logger.error('Given <provider> unknown. Please use any of "anthropic", "cohere", "huggingface", or "openai"')
    # Compose the query pipeline
    query_pipeline = Pipeline()
    query_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    return query_pipeline


def add_example_data(document_store, dir):
    # Importing top-level causes a circular import
    from haystack.nodes import TextConverter, PreProcessor

    if dir == "data/GoT_getting_started":
        # Download and add Game of Thrones TXT files
        fetch_archive_from_http(
            url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip",
            output_dir=dir,
        )
        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
        converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
        docs = [converter.convert(file_path=file, meta=None)[0] for file in files_to_index]
    else:
        # Here you can add a local folder with your files (.txt, .pdf, .docx).
        # You might need to install additional packages with "pip install farm-haystack[ocr,preprocessing,file-conversion,pdf]".
        # For more details, see: https://haystack.deepset.ai/tutorials/08_preprocessing.
        # Be aware that some of your data will be sent to external APIs if you use this functionality!
        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
        logger.info("Adding %d files from local disk at %s.", len(files_to_index), dir)
        docs = convert_files_to_docs(dir_path=dir)

    preprocessor = PreProcessor(
        split_by="word", split_length=200, split_overlap=0, split_respect_sentence_boundary=True
    )
    docs_processed = preprocessor.process(docs)

    document_store.write_documents(documents=docs_processed)
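The two helpers can also be used directly, for example to index your own folder instead of the bundled Game of Thrones data. A minimal sketch, where the "my_docs" path and the placeholder key are assumptions (local .pdf/.docx files may need the extra dependencies mentioned in the comments above):

from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import add_example_data, build_pipeline

# Same flow as the getting started example, but pointed at a local folder.
document_store = InMemoryDocumentStore(use_bm25=True)
pipeline = build_pipeline("openai", "ADD KEY HERE", document_store)
add_example_data(document_store, "my_docs")  # "my_docs" is a placeholder for your own folder
result = pipeline.run(query="Who is the father of Arya Stark?")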