From d048bb53523b11426d3c39f7cdc0aa8d15f0cf67 Mon Sep 17 00:00:00 2001
From: Timo Moeller
Date: Wed, 6 Sep 2023 12:14:08 +0200
Subject: [PATCH] docs: Add minimal getting started code to showcase haystack +
 RAG (#5578)

* init

* Change question

* Add TODO comment

* Addressing feedback

* Add local folder option. Move additional functions inside haystack.utils for easier imports

* Apply Daria's review suggestions

Co-authored-by: Daria Fokina

* Add integration test

* change string formatting

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>

* Add outputparser to HF

* Exclude anthropic test

---------

Co-authored-by: Daria Fokina
Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
---
 examples/getting_started.py       | 34 +++++++++++++
 examples/test_getting_started.py  | 25 +++++
 haystack/utils/__init__.py        |  1 +
 haystack/utils/getting_started.py | 84 +++++++++++++++++++++++++++++++
 4 files changed, 144 insertions(+)
 create mode 100644 examples/getting_started.py
 create mode 100644 examples/test_getting_started.py
 create mode 100644 haystack/utils/getting_started.py

diff --git a/examples/getting_started.py b/examples/getting_started.py
new file mode 100644
index 000000000..af0d5012f
--- /dev/null
+++ b/examples/getting_started.py
@@ -0,0 +1,34 @@
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.utils import build_pipeline, add_example_data, print_answers
+
+
+def getting_started(provider, API_KEY):
+    """
+    This getting_started example shows you how to use LLMs with your data using a technique called Retrieval-Augmented Generation (RAG).
+
+    :param provider: We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai".
+    :param API_KEY: The API key matching the provider.
+
+    """
+
+    # We support many different databases. Here we load a simple and lightweight in-memory database.
+    document_store = InMemoryDocumentStore(use_bm25=True)
+
+    # Pipelines are the main abstraction in Haystack; they connect components like LLMs and databases.
+    pipeline = build_pipeline(provider, API_KEY, document_store)
+
+    # Download and add Game of Thrones TXT articles to Haystack's database.
+    # You can also provide a folder with your local documents.
+    # You might need to install additional dependencies - look inside the function for more information.
+    add_example_data(document_store, "data/GoT_getting_started")
+
+    # Ask a question on the data you just added.
+    result = pipeline.run(query="Who is the father of Arya Stark?")
+
+    # For details such as which documents were used to generate the answer, look into the result object.
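+    # As exercised in examples/test_getting_started.py below, the result is a dictionary whose
+    # "answers" list holds Answer objects and whose "documents" list holds the retrieved Documents.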
+    print_answers(result, details="medium")
+    return result
+
+
+if __name__ == "__main__":
+    getting_started(provider="openai", API_KEY="ADD KEY HERE")
diff --git a/examples/test_getting_started.py b/examples/test_getting_started.py
new file mode 100644
index 000000000..657577c9f
--- /dev/null
+++ b/examples/test_getting_started.py
@@ -0,0 +1,25 @@
+import os
+
+import pytest
+
+from examples.getting_started import getting_started
+from haystack.schema import Answer, Document
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"])
+def test_getting_started(provider):
+    if provider == "anthropic":
+        api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+    elif provider == "cohere":
+        api_key = os.environ.get("COHERE_API_KEY", "")
+    elif provider == "huggingface":
+        api_key = os.environ.get("HUGGINGFACE_API_KEY", "")
+    elif provider == "openai":
+        api_key = os.environ.get("OPENAI_API_KEY", "")
+    result = getting_started(provider=provider, API_KEY=api_key)
+
+    # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly.
+    assert isinstance(result, dict)
+    assert isinstance(result["answers"][0], Answer)
+    assert isinstance(result["documents"][0], Document)
diff --git a/haystack/utils/__init__.py b/haystack/utils/__init__.py
index 6a27f4b0f..15a979408 100644
--- a/haystack/utils/__init__.py
+++ b/haystack/utils/__init__.py
@@ -24,3 +24,4 @@ from haystack.utils.experiment_tracking import (
 from haystack.utils.early_stopping import EarlyStopping
 from haystack.utils.labels import aggregate_labels
 from haystack.utils.batching import get_batches_from_generator
+from haystack.utils.getting_started import build_pipeline, add_example_data
diff --git a/haystack/utils/getting_started.py b/haystack/utils/getting_started.py
new file mode 100644
index 000000000..5a227b721
--- /dev/null
+++ b/haystack/utils/getting_started.py
@@ -0,0 +1,84 @@
+import logging
+import os
+
+from haystack.utils import convert_files_to_docs
+from haystack.utils import fetch_archive_from_http
+
+logger = logging.getLogger(__name__)
+
+
+def build_pipeline(provider, API_KEY, document_store):
+    # Importing at the top level causes a circular import
+    from haystack.nodes import AnswerParser, PromptNode, PromptTemplate, BM25Retriever
+    from haystack.pipelines import Pipeline
+
+    provider = provider.lower()
+    # A retriever selects the right documents when given a question.
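+    # BM25 ranks documents by keyword overlap with the query; top_k=5 means the five
+    # best-matching documents are retrieved and passed on to the LLM as context.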
+    retriever = BM25Retriever(document_store=document_store, top_k=5)
+    # Load a prompt for retrieval-augmented generation from https://prompthub.deepset.ai/?prompt=deepset%2Fquestion-answering-with-references
+    question_answering_with_references = PromptTemplate(
+        prompt="deepset/question-answering-with-references",
+        output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]"),
+    )
+    # Load the LLM model
+    if provider == "anthropic":
+        prompt_node = PromptNode(
+            model_name_or_path="claude-2", api_key=API_KEY, default_prompt_template=question_answering_with_references
+        )
+    elif provider == "cohere":
+        prompt_node = PromptNode(
+            model_name_or_path="command", api_key=API_KEY, default_prompt_template=question_answering_with_references
+        )
+    elif provider == "huggingface":
+        # TODO: swap out for meta-llama/Llama-2-7b-chat-hf or the 40b model once supported in Haystack+HF API free tier
+        # The tiiuae/falcon-7b-instruct model cannot handle a complex prompt with references, so we use a very simple one
+        simple_QA = PromptTemplate(
+            prompt="deepset/question-answering", output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]")
+        )
+        prompt_node = PromptNode(
+            model_name_or_path="tiiuae/falcon-7b-instruct", api_key=API_KEY, default_prompt_template=simple_QA
+        )
+    elif provider == "openai":
+        prompt_node = PromptNode(
+            model_name_or_path="gpt-3.5-turbo-0301",
+            api_key=API_KEY,
+            default_prompt_template=question_answering_with_references,
+        )
+    else:
+        raise ValueError(f'Provider "{provider}" is unknown. Please use one of "anthropic", "cohere", "huggingface", or "openai".')
+    # Compose the query pipeline
+    query_pipeline = Pipeline()
+    query_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
+    query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])
+
+    return query_pipeline
+
+
+def add_example_data(document_store, dir):
+    # Importing at the top level causes a circular import
+    from haystack.nodes import TextConverter, PreProcessor
+
+    if dir == "data/GoT_getting_started":
+        # Download and add Game of Thrones TXT files
+        fetch_archive_from_http(
+            url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip",
+            output_dir=dir,
+        )
+        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
+        converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
+        docs = [converter.convert(file_path=file, meta=None)[0] for file in files_to_index]
+    else:
+        # Here you can add a local folder with your files (.txt, .pdf, .docx).
+        # You might need to install additional packages with "pip install farm-haystack[ocr,preprocessing,file-conversion,pdf]".
+        # For more details, see: https://haystack.deepset.ai/tutorials/08_preprocessing.
+        # Be aware that some of your data will be sent to external APIs if you use this functionality!
+        files_to_index = [dir + "/" + f for f in os.listdir(dir)]
+        logger.info("Adding %s files from local disk at %s.", len(files_to_index), dir)
+        docs = convert_files_to_docs(dir_path=dir)
+
+    preprocessor = PreProcessor(
+        split_by="word", split_length=200, split_overlap=0, split_respect_sentence_boundary=True
+    )
+    docs_processed = preprocessor.process(docs)
+
+    document_store.write_documents(documents=docs_processed)
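
A minimal usage sketch of the new helpers (illustrative only, not part of the patch: it assumes farm-haystack is installed, reads the key from an environment variable instead of hard-coding it, and asks a different question than the example script):

    import os

    from haystack.document_stores import InMemoryDocumentStore
    from haystack.utils import build_pipeline, add_example_data, print_answers

    # Build the same RAG pipeline that examples/getting_started.py wires up.
    document_store = InMemoryDocumentStore(use_bm25=True)
    pipeline = build_pipeline("openai", os.environ.get("OPENAI_API_KEY", ""), document_store)

    # Index the example Game of Thrones articles, then ask a question of your own.
    add_example_data(document_store, "data/GoT_getting_started")
    result = pipeline.run(query="Who is the mother of Arya Stark?")
    print_answers(result, details="medium")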