From d048bb53523b11426d3c39f7cdc0aa8d15f0cf67 Mon Sep 17 00:00:00 2001
From: Timo Moeller
Date: Wed, 6 Sep 2023 12:14:08 +0200
Subject: [PATCH] docs: Add minimal getting started code to showcase haystack +
 RAG (#5578)

* init

* Change question

* Add TODO comment

* Addressing feedback

* Add local folder option. Move additional functions inside haystack.utils for easier imports

* Apply Daria's review suggestions

Co-authored-by: Daria Fokina

* Add integration test

* change string formatting

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Add outputparser to HF

* Exclude anthropic test

---------

Co-authored-by: Daria Fokina
Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
---
 examples/getting_started.py       | 34 +++++++++++++
 examples/test_getting_started.py  | 25 +++++
 haystack/utils/__init__.py        |  1 +
 haystack/utils/getting_started.py | 84 +++++++++++++++++++++++++++++++
 4 files changed, 144 insertions(+)
 create mode 100644 examples/getting_started.py
 create mode 100644 examples/test_getting_started.py
 create mode 100644 haystack/utils/getting_started.py

diff --git a/examples/getting_started.py b/examples/getting_started.py
new file mode 100644
index 000000000..af0d5012f
--- /dev/null
+++ b/examples/getting_started.py
@@ -0,0 +1,34 @@
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.utils import build_pipeline, add_example_data, print_answers
+
+
+def getting_started(provider, API_KEY):
+    """
+    This getting_started example shows you how to use LLMs with your data using a technique called Retrieval-Augmented Generation (RAG).
+
+    :param provider: We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai".
+    :param API_KEY: The API key matching the provider.
+
+    """
+
+    # We support many different databases. Here we load a simple and lightweight in-memory database.
+    document_store = InMemoryDocumentStore(use_bm25=True)
+
+    # Pipelines are the main abstraction in Haystack; they connect components like LLMs and databases.
+    pipeline = build_pipeline(provider, API_KEY, document_store)
+
+    # Download and add Game of Thrones TXT articles to Haystack's database.
+    # You can also provide a folder with your local documents.
+    # You might need to install additional dependencies - look inside the function for more information.
+    add_example_data(document_store, "data/GoT_getting_started")
+
+    # Ask a question on the data you just added.
+    result = pipeline.run(query="Who is the father of Arya Stark?")
+
+    # For details such as which documents were used to generate the answer, look into the result object.
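+    # As exercised in examples/test_getting_started.py below, the result is a dictionary whose
+    # "answers" list holds Answer objects and whose "documents" list holds the retrieved Documents.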
+    print_answers(result, details="medium")
+    return result
+
+
+if __name__ == "__main__":
+    getting_started(provider="openai", API_KEY="ADD KEY HERE")
diff --git a/examples/test_getting_started.py b/examples/test_getting_started.py
new file mode 100644
index 000000000..657577c9f
--- /dev/null
+++ b/examples/test_getting_started.py
@@ -0,0 +1,25 @@
+import os
+
+import pytest
+
+from examples.getting_started import getting_started
+from haystack.schema import Answer, Document
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
+def test_getting_started(provider):
+    if provider == "anthropic":
+        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+    elif provider == "cohere":
+        api_key = os.environ.get("COHERE_API_KEY", "")
+    elif provider == "huggingface":
+        api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
+    elif provider == "openai":
+        api_key = os.environ.get("OPENAI_API_KEY", "")
+    result = getting_started(provider=provider, API_KEY=api_key)
+
+    # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
+    assert isinstance(result, dict)
+    assert isinstance(result["answers"][0], Answer)
+    assert isinstance(result["documents"][0], Document)
diff --git a/haystack/utils/__init__.py b/haystack/utils/__init__.py
index 6a27f4b0f..15a979408 100644
--- a/haystack/utils/__init__.py
+++ b/haystack/utils/__init__.py
@@ -24,3 +24,4 @@ from haystack.utils.experiment_tracking import (
 from haystack.utils.early_stopping import EarlyStopping
 from haystack.utils.labels import aggregate_labels
 from haystack.utils.batching import get_batches_from_generator
+from haystack.utils.getting_started import build_pipeline, add_example_data
diff --git a/haystack/utils/getting_started.py b/haystack/utils/getting_started.py
new file mode 100644
index 000000000..5a227b721
--- /dev/null
+++ b/haystack/utils/getting_started.py
@@ -0,0 +1,84 @@
+import logging
+import os
+
+from haystack.utils import convert_files_to_docs
+from haystack.utils import fetch_archive_from_http
+
+logger = logging.getLogger(__name__)
+
+
+def build_pipeline(provider, API_KEY, document_store):
+    # Importing at the top level causes a circular import
+    from haystack.nodes import AnswerParser, PromptNode, PromptTemplate, BM25Retriever
+    from haystack.pipelines import Pipeline
+
+    provider = provider.lower()
+    # A retriever selects the right documents when given a question.
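+    # BM25 ranks documents by keyword overlap with the query; top_k=5 means the five
+    # best-matching documents are retrieved and passed on to the LLM as context.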
+    retriever = BM25Retriever(document_store=document_store, top_k=5)
+    # Load a prompt for retrieval-augmented generation from https://prompthub.deepset.ai/?prompt=deepset%2Fquestion-answering-with-references
+    question_answering_with_references = PromptTemplate(
+        prompt="deepset/question-answering-with-references",
+        output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]"),
+    )
+    # Load the LLM model
+    if provider == "anthropic":
+        prompt_node = PromptNode(
+            model_name_or_path="claude-2", api_key=API_KEY, default_prompt_template=question_answering_with_references
+        )
+    elif provider == "cohere":
+        prompt_node = PromptNode(
+            model_name_or_path="command", api_key=API_KEY, default_prompt_template=question_answering_with_references
+        )
+    elif provider == "huggingface":
+        # TODO: swap out for meta-llama/Llama-2-7b-chat-hf or the 40b model once supported in Haystack+HF API free tier
+        # The tiiuae/falcon-7b-instruct model cannot handle a complex prompt with references, so we use a very simple one
+        simple_QA = PromptTemplate(
+            prompt="deepset/question-answering", output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]")
+        )
+        prompt_node = PromptNode(
+            model_name_or_path="tiiuae/falcon-7b-instruct", api_key=API_KEY, default_prompt_template=simple_QA
+        )
+    elif provider == "openai":
+        prompt_node = PromptNode(
+            model_name_or_path="gpt-3.5-turbo-0301",
+            api_key=API_KEY,
+            default_prompt_template=question_answering_with_references,
+        )
+    else:
+        raise ValueError(f'Provider "{provider}" is unknown. Please use one of "anthropic", "cohere", "huggingface", or "openai".')
+    # Compose the query pipeline
+    query_pipeline = Pipeline()
+    query_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
+    query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])
+
+    return query_pipeline
+
+
+def add_example_data(document_store, dir):
+    # Importing at the top level causes a circular import
+    from haystack.nodes import TextConverter, PreProcessor
+
+    if dir == "data/GoT_getting_started":
+        # Download and add Game of Thrones TXT files
+        fetch_archive_from_http(
+            url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip",
+            output_dir=dir,
+        )
+        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
+        converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
+        docs = [converter.convert(file_path=file, meta=None)[0] for file in files_to_index]
+    else:
+        # Here you can add a local folder with your files (.txt, .pdf, .docx).
+        # You might need to install additional packages with "pip install farm-haystack[ocr,preprocessing,file-conversion,pdf]".
+        # For more details, see: https://haystack.deepset.ai/tutorials/08_preprocessing.
+        # Be aware that some of your data will be sent to external APIs if you use this functionality!
+        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
+        logger.info("Adding %s files from local disk at %s.", len(files_to_index), dir)
+        docs = convert_files_to_docs(dir_path=dir)
+
+    preprocessor = PreProcessor(
+        split_by="word", split_length=200, split_overlap=0, split_respect_sentence_boundary=True
+    )
+    docs_processed = preprocessor.process(docs)
+
+    document_store.write_documents(documents=docs_processed)
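
A minimal usage sketch of the new helpers (illustrative only, not part of the patch: it assumes farm-haystack is installed, reads the key from an environment variable instead of hard-coding it, and asks a different question than the example script):

    import os

    from haystack.document_stores import InMemoryDocumentStore
    from haystack.utils import build_pipeline, add_example_data, print_answers

    # Build the same RAG pipeline that examples/getting_started.py wires up.
    document_store = InMemoryDocumentStore(use_bm25=True)
    pipeline = build_pipeline("openai", os.environ.get("OPENAI_API_KEY", ""), document_store)

    # Index the example Game of Thrones articles, then ask a question of your own.
    add_example_data(document_store, "data/GoT_getting_started")
    result = pipeline.run(query="Who is the mother of Arya Stark?")
    print_answers(result, details="medium")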