From 1e8af84ecc7f59afffbcb1a28e3779031cc81987 Mon Sep 17 00:00:00 2001
From: Branden Chan <33759007+brandenchan@users.noreply.github.com>
Date: Thu, 19 Nov 2020 14:58:27 +0100
Subject: [PATCH] Make more changes to documentation (#578)

* First batch of changes

* Add RAG tutorial links

* Prettify RAG tutorial

* draft of generator doc

* Add text

* Complete generator page

* Create optimization section

* Split intro

* Fix formatting tutorial 7
---
 README.md                                |   5 +
 docs/_src/tutorials/tutorials/1.md       |   6 +-
 docs/_src/tutorials/tutorials/2.md       |   6 +-
 docs/_src/tutorials/tutorials/3.md       |   6 +-
 docs/_src/tutorials/tutorials/4.md       |  19 +-
 docs/_src/tutorials/tutorials/5.md       |  14 +-
 docs/_src/tutorials/tutorials/6.md       |  13 +-
 docs/_src/tutorials/tutorials/7.md       | 174 +++++++++++++++++
 docs/_src/tutorials/tutorials/headers.py |   9 +-
 docs/_src/usage/usage/document_store.md  |   2 +-
 docs/_src/usage/usage/generator.md       |  29 +++
 docs/_src/usage/usage/intro.md           |  60 ------
 docs/_src/usage/usage/optimization.md    |  58 ++++++
 docs/_src/usage/usage/preprocessing.md   |  28 +--
 docs/_src/usage/usage/retriever.md       |   2 +
 docs/_src/usage/usage/terms.md           |   1 +
 docs/_src/usage/usage/use_cases.md       |  69 +++++++
 tutorials/Tutorial7_RAG_Generator.ipynb  | 232 +++++++++++++++++------
 18 files changed, 563 insertions(+), 170 deletions(-)
 create mode 100644 docs/_src/tutorials/tutorials/7.md
 create mode 100644 docs/_src/usage/usage/generator.md
 create mode 100644 docs/_src/usage/usage/optimization.md
 create mode 100644 docs/_src/usage/usage/use_cases.md

diff --git a/README.md b/README.md
index 1ae0fbc48..d247579af 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,11 @@ We recommend Elasticsearch or FAISS, but have also more light-weight options for
  [Jupyter notebook](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb)
  or
  [Colab](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb)

+- Tutorial 7 - Generative QA via "Retrieval-Augmented Generation":
+  [Jupyter notebook](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb)
+  or
+  [Colab](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb)
+
## Quick Tour

[File Conversion](https://github.com/deepset-ai/haystack/blob/master/README.md#1-file-conversion) | [Preprocessing](https://github.com/deepset-ai/haystack/blob/master/README.md#2-preprocessing) | [DocumentStores](https://github.com/deepset-ai/haystack/blob/master/README.md#3-documentstores) | [Retrievers](https://github.com/deepset-ai/haystack/blob/master/README.md#5-retrievers) | [Readers](https://github.com/deepset-ai/haystack/blob/master/README.md#5-readers) | [REST API](https://github.com/deepset-ai/haystack/blob/master/README.md#6-rest-api) | [Labeling Tool](https://github.com/deepset-ai/haystack/blob/master/README.md#7-labeling-tool)

diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md
index 92faee639..d7ffdb814 100644
--- a/docs/_src/tutorials/tutorials/1.md
+++ b/docs/_src/tutorials/tutorials/1.md
@@ -28,9 +28,11 @@ marvellous seven kingdoms...

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

diff --git a/docs/_src/tutorials/tutorials/2.md b/docs/_src/tutorials/tutorials/2.md
index 89c053c3f..c3bc0433b 100644
--- a/docs/_src/tutorials/tutorials/2.md
+++ b/docs/_src/tutorials/tutorials/2.md
@@ -22,9 +22,11 @@ This tutorial shows you how to fine-tune a pretrained model on your own dataset.

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

diff --git a/docs/_src/tutorials/tutorials/3.md b/docs/_src/tutorials/tutorials/3.md
index 2a42ae103..ebd353bae 100644
--- a/docs/_src/tutorials/tutorials/3.md
+++ b/docs/_src/tutorials/tutorials/3.md
@@ -22,9 +22,11 @@ If you are interested in more feature-rich Elasticsearch, then please refer to t

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md
index 28f9025d6..9a8dff48f 100644
--- a/docs/_src/tutorials/tutorials/4.md
+++ b/docs/_src/tutorials/tutorials/4.md
@@ -31,9 +31,11 @@ In some use cases, a combination of extractive QA and FAQ-style can also be an i

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

@@ -54,19 +56,19 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock

```python
# Recommended: Start Elasticsearch using Docker
-# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2
+# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
```


```python
# In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

@@ -98,7 +100,7 @@ We can use the `EmbeddingRetriever` for this purpose and specify a model that we

```python
-retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=False)
+retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=True)
```

### Prepare & Index FAQ data

@@ -121,7 +123,6 @@
# Get embeddings for our questions from the FAQs
questions = list(df["question"].values)
df["question_emb"] = retriever.embed_queries(texts=questions)
-df["question_emb"] = df["question_emb"].apply(list)  # convert from numpy to list for ES ingestion
df = df.rename(columns={"answer": "text"})

# Convert Dataframe to list of dicts and index them in our DocumentStore

diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md
index 85475f202..1d6fe4274 100644
--- a/docs/_src/tutorials/tutorials/5.md
+++ b/docs/_src/tutorials/tutorials/5.md
@@ -21,21 +21,23 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

```python
# In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

diff --git a/docs/_src/tutorials/tutorials/6.md b/docs/_src/tutorials/tutorials/6.md
index ec426daad..9de9b0c3d 100644
--- a/docs/_src/tutorials/tutorials/6.md
+++ b/docs/_src/tutorials/tutorials/6.md
@@ -77,9 +77,11 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

@@ -142,11 +144,12 @@ from haystack.retriever.dense import DensePassageRetriever
retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+                                 max_seq_len_query=64,
+                                 max_seq_len_passage=256,
+                                 batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
-                                 max_seq_len=256,
-                                 batch_size=16,
-                                 remove_sep_tok_from_untitled_passages=True)
+                                 use_fast_tokenizers=True)
# Important:
# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.

diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md
new file mode 100644
index 000000000..c89df7ab5
--- /dev/null
+++ b/docs/_src/tutorials/tutorials/7.md
@@ -0,0 +1,174 @@


# Generative QA with "Retrieval-Augmented Generation"

While extractive QA highlights the span of text that answers a query,
generative QA can return a novel text answer that it has composed.
In this tutorial, you will learn how to set up a generative system using the
[RAG model](https://arxiv.org/abs/2005.11401), which conditions the
answer generator on a set of retrieved documents.

Here are the packages and imports that we'll need:


```python
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install urllib3==1.25.4
!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

```


```python
from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.generator.transformers import RAGenerator
from haystack.retriever.dense import DensePassageRetriever
```

Let's download a CSV containing some sample text and preprocess the data.



```python
# Download sample
temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
open('small_generator_dataset.csv', 'wb').write(temp.content)

# Create dataframe with columns "title" and "text"
df = pd.read_csv("small_generator_dataset.csv", sep=',')
# Minimal cleaning
df.fillna(value="", inplace=True)

print(df.head())
```

We can cast our data into Haystack Document objects.
Alternatively, we can also just use dictionaries with "text" and "meta" fields.


```python
# Use data to initialize Document objects
titles = list(df["title"].values)
texts = list(df["text"].values)
documents: List[Document] = []
for title, text in zip(titles, texts):
    documents.append(
        Document(
            text=text,
            meta={
                "name": title or ""
            }
        )
    )
```

Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator.
FAISS is chosen here since it is an optimized vector store.


```python
# Initialize FAISS document store.
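# ("Flat" gives exact, brute-force similarity search, which suits the small
# sample dataset used here; for larger corpora, a different FAISS index factory
# string can trade a little accuracy for speed.)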
# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True
)

# Initialize DPR Retriever to encode documents, encode question and query documents
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True,
)

# Initialize RAG Generator
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=False,
    top_k_answers=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)
```

We write documents to the DocumentStore, first deleting any existing documents and then calling `write_documents()`.
The `update_embeddings()` method uses the retriever to create an embedding for each document.



```python
# Delete existing documents in document store
document_store.delete_all_documents()

# Write documents to document store
document_store.write_documents(documents)

# Add documents embeddings to index
document_store.update_embeddings(
    retriever=retriever
)
```

Here are our questions:


```python
QUESTIONS = [
    "who got the first nobel prize in physics",
    "when is the next deadpool movie being released",
    "which mode is used for short wave broadcast service",
    "who is the owner of reading football club",
    "when is the next scandal episode coming out",
    "when is the last time the philadelphia won the superbowl",
    "what is the most current adobe flash player version",
    "how many episodes are there in dragon ball z",
    "what is the first step in the evolution of the eye",
    "where is gall bladder situated in human body",
    "what is the main mineral in lithium batteries",
    "who is the president of usa right now",
    "where do the greasers live in the outsiders",
    "panda is a national animal of which country",
    "what is the name of manchester united stadium",
]
```

Now let's run our system!
The retriever will pick out a small subset of documents that it finds relevant.
These are used to condition the generator as it generates the answer.
It should then return novel text spans that form an answer to your question!


```python
# Now generate an answer for each question
for question in QUESTIONS:
    # Retrieve related documents from retriever
    retriever_results = retriever.retrieve(
        query=question
    )

    # Now generate answer from question and retrieved documents
    predicted_result = generator.predict(
        question=question,
        documents=retriever_results,
        top_k=1
    )

    # Print your answer
    answers = predicted_result["answers"]
    print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
```

diff --git a/docs/_src/tutorials/tutorials/headers.py b/docs/_src/tutorials/tutorials/headers.py
index b09e2a4fc..a7cb8c221 100644
--- a/docs/_src/tutorials/tutorials/headers.py
+++ b/docs/_src/tutorials/tutorials/headers.py
@@ -46,6 +46,13 @@ metaDescription: ""
slug: "/docs/tutorial6"
date: "2020-09-03"
id: "tutorial6md"
+--->""",
+    7: """"""
-    }

diff --git a/docs/_src/usage/usage/document_store.md b/docs/_src/usage/usage/document_store.md
index 1827ca1c5..ff182e123 100644
--- a/docs/_src/usage/usage/document_store.md
+++ b/docs/_src/usage/usage/document_store.md
@@ -79,7 +79,7 @@ See API documentation for more info.
DocumentStores expect Documents in dictionary form, like that below.
They are loaded using the `DocumentStore.write_documents()` method.
-See [Preprocessing](/docs/latest/preprocessingmd) for more information on how to best prepare your data.
+See [Preprocessing](/docs/latest/preprocessingmd) for more information on the cleaning and splitting steps that will help you maximize Haystack's performance.

[//]: # (Add link to preprocessing section)

diff --git a/docs/_src/usage/usage/generator.md b/docs/_src/usage/usage/generator.md
new file mode 100644
index 000000000..9b04b34a2
--- /dev/null
+++ b/docs/_src/usage/usage/generator.md
@@ -0,0 +1,29 @@


# Generator

See [Tutorial 7](/docs/latest/tutorial7md) for a guide on how to build your own generative QA system.

While extractive QA highlights the span of text that answers a query,
generative QA can return a novel text answer that it has composed.
The best current approaches, such as [Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401),
can draw upon both the knowledge they gained during language model pretraining (parametric memory)
and the passages provided to them by a retriever (non-parametric memory).
With the advent of Transformer-based retrieval methods such as [Dense Passage Retrieval](https://arxiv.org/abs/2004.04906),
retriever and generator can be trained concurrently from a single loss signal.

Pros
* More appropriately phrased answers
* Able to synthesize information from different texts
* Can draw on the latent knowledge stored in the language model

Cons
* Not easy to track what piece of information the generator is basing its response on

diff --git a/docs/_src/usage/usage/intro.md b/docs/_src/usage/usage/intro.md
index 28917abb4..d20372c13 100644
--- a/docs/_src/usage/usage/intro.md
+++ b/docs/_src/usage/usage/intro.md
@@ -24,66 +24,6 @@ and Haystack is designed to be the bridge between research and industry.
* **Domain adaptation**: Fine-tune models to your own domain & improve them continuously via user feedback

## Use cases

### Semantic Search System

Take the leap from using keyword search on your own documents to semantic search with Haystack.


* Store your documents in the database of your choice (Elasticsearch, SQL, in memory, FAISS)


* Perform question driven queries.

Expect to see results that highlight the very sentence that contains the answer to your question.
Thanks to the power of Transformer based language models, results are chosen based on compatibility in meaning
rather than lexical overlap.



![image](../../img/search.png)

### Information Extractor

Automate the extraction of relevant information from a set of documents that pertain to the same topics but for different entities.

Haystack can:


* Apply a set of standard questions to each document in a store


* Return a NO_ANSWER if a given document does not contain the answer to a question

Say you have the financial reports for different companies over different years.
You can gather a set of standard questions which are applicable to each financial report,
like *what is the revenue forecast for 2020?* or *what are the main sources of income?*.
Haystack will try to find an answer for each question within each document!

We’ve seen this style of application be particularly effective in the sphere of finance and patent law
but we see a lot of potential in using this to gain a better overview of academic papers and internal business documents.

### FAQ Style Question Answering

Leverage existing FAQ documents and semantic similarity search to answer new incoming questions.
The workflow is as follows:


* Store a set of FAQ documents in Haystack


* The user presents a new question


* Haystack will find the closest match to the new question in the FAQ documents


* The user will be presented with the most similar Question Answer pair

Haystack’s flexibility allows you to give new users more dynamic access to your existing documentation.

## Technology

diff --git a/docs/_src/usage/usage/optimization.md b/docs/_src/usage/usage/optimization.md
new file mode 100644
index 000000000..226754b46
--- /dev/null
+++ b/docs/_src/usage/usage/optimization.md
@@ -0,0 +1,58 @@


# Optimization

## Document Length

Document length has a very direct impact on the speed of the Reader,
which is why we recommend using the `PreProcessor` class to clean and split your documents.
**If you halve the length of your documents, you will halve the workload placed on your Reader.**

For **sparse retrievers**, very long documents pose a challenge since the signal of the relevant section of text
can get washed out by the rest of the document.
We would recommend making sure that **documents are no longer than 10,000 words**.

**Dense retrievers** are limited in the length of text that they can read in one pass.
As such, it is important that documents are not longer than the dense retriever's maximum input length.
By default, Haystack's DensePassageRetriever model has a maximum length of 256 tokens.
As such, we recommend that documents contain significantly fewer words.
We have found decent performance with **documents around 100 words long**.

## Respecting Sentence Boundaries

When splitting documents, it is generally not a good idea to let document boundaries fall in the middle of sentences.
Doing so means that each document will contain incomplete sentence fragments
which may be hard for both retriever and reader to interpret.
It is therefore recommended to set `split_respect_sentence_boundary=True` when initializing your `PreProcessor`.

## Choosing the Right top-k Values

The `top-k` parameter in both the `Retriever` and `Reader` determines how many results they return.
More specifically, `Retriever` `top-k` dictates how many retrieved documents are passed on to the next stage,
while `Reader` `top-k` determines how many answer candidates to show.

In our experiments, we have found that **`Retriever` `top_k=10`
gives decent overall performance** and so we have set this as the default in Haystack.

The choice of `Retriever` `top-k` is a trade-off between speed and accuracy,
especially when there is a `Reader` in the pipeline.
Setting it higher means passing more documents to the `Reader`,
thus reducing the chance that the answer-containing passage is missed.
However, passing more documents to the `Reader` will create a larger workload for the component.
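
If you are unsure where this trade-off leaves you, it can help to measure the impact directly.
Below is a minimal sketch that times one query at a few retriever top-k settings;
it assumes an already initialized `Finder` (see below) and an illustrative question:
``` python
import time

# Illustrative benchmark: higher top_k_retriever usually improves recall
# but increases Reader workload and latency.
for k in [1, 5, 10, 20]:
    start = time.perf_counter()
    answers = finder.get_answers(question="What is the revenue forecast for 2020?",
                                 top_k_retriever=k, top_k_reader=5)
    print(f"top_k_retriever={k}: {time.perf_counter() - start:.2f} s")
```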
These parameters can easily be tweaked as follows if using a `Finder`:
``` python
answers = finder.get_answers(question=question,
                             top_k_retriever=10,
                             top_k_reader=5)
```
or like this if directly calling the `Retriever`:
``` python
retrieved_docs = retriever.retrieve(query=question, top_k=10)
```

diff --git a/docs/_src/usage/usage/preprocessing.md b/docs/_src/usage/usage/preprocessing.md
index 1faa8be7d..ee49db7ed 100644
--- a/docs/_src/usage/usage/preprocessing.md
+++ b/docs/_src/usage/usage/preprocessing.md
@@ -90,6 +90,8 @@ it is recommended that they are further processed in order to ensure optimal Ret
The `PreProcessor` takes one of the documents created by the converter as input,
performs various cleaning steps and splits them into multiple smaller documents.

+For suggestions on how best to split your documents, see [Optimization](/docs/latest/optimizationmd).
+
```python
doc = converter.convert(file_path=file, meta=None)
processor = PreProcessor(clean_empty_lines=True,
@@ -107,29 +109,3 @@ docs = processor.process(d)
* `split_by` determines what unit the document is split by: `'word'`, `'sentence'` or `'passage'`
* `split_length` sets a maximum number of `'word'`, `'sentence'` or `'passage'` units per output document
* `split_respect_sentence_boundary` ensures that document boundaries do not fall in the middle of sentences

## Impact of Document Splitting

The File Converters will treat each file as a single document regardless of length.
This is not always ideal as long documents can have negative impacts on both speed and accuracy.

Document length has a very direct impact on the speed of the Reader.
**If you halve the length of your documents, you can expect that the Reader will double in speed.**

It is generally not a good idea to let document boundaries fall in the middle of sentences.
Doing so means that each document will contain incomplete sentence fragments
which maybe be hard for both retriever and reader to interpret.

For **sparse retrievers**, very long documents pose a challenge since the signal of the relevant section of text
can get washed out by the rest of the document.
We would recommend making sure that **documents are no longer than 10,000 words**.

**Dense retrievers** are limited in the length of text that they can read in one pass.
As such, it is important that documents are not longer than the dense retriever's maximum input length.
By default, Haystack's DensePassageRetriever model has a maximum length of 256 tokens.
As such, we recommend that documents contain significantly less words.
We have found decent performance with **documents around 100 words long**.




diff --git a/docs/_src/usage/usage/retriever.md b/docs/_src/usage/usage/retriever.md
index 363095e0a..b4bf42cff 100644
--- a/docs/_src/usage/usage/retriever.md
+++ b/docs/_src/usage/usage/retriever.md
@@ -32,6 +32,8 @@ Here are the combinations which are supported:
| Embedding | Y | Y | N | Y |
| DPR | Y | Y | N | Y |

+See [Optimization](/docs/latest/optimizationmd) for suggestions on how to choose top-k values.
+
## TF-IDF

### Description

diff --git a/docs/_src/usage/usage/terms.md b/docs/_src/usage/usage/terms.md
index cb98a53c4..d9b7be75d 100644
--- a/docs/_src/usage/usage/terms.md
+++ b/docs/_src/usage/usage/terms.md
@@ -44,6 +44,7 @@ In question answering models (and hence in Haystack Readers), this is usually a
**Question Answering (QA)** - A popular task in the world of NLP where systems have to find answers to questions.
The term is generally used to refer to extractive question answering, where a system has to find the minimal text span in a given document that contains the answer to the question.
+Note, however, that it may also refer to abstractive question answering or FAQ matching.

**Reader** - The component in Haystack that does the closest reading of a document to extract the exact text which answers a question.

diff --git a/docs/_src/usage/usage/use_cases.md b/docs/_src/usage/usage/use_cases.md
new file mode 100644
index 000000000..49311328c
--- /dev/null
+++ b/docs/_src/usage/usage/use_cases.md
@@ -0,0 +1,69 @@



# Use cases

## Semantic Search System

Take the leap from using keyword search on your own documents to semantic search with Haystack.


* Store your documents in the database of your choice (Elasticsearch, SQL, in memory, FAISS)


* Perform question-driven queries.

Expect to see results that highlight the very sentence that contains the answer to your question.
Thanks to the power of Transformer-based language models, results are chosen based on compatibility in meaning
rather than lexical overlap.



![image](../../img/search.png)

## Information Extractor

Automate the extraction of relevant information from a set of documents that pertain to the same topics but for different entities.

Haystack can:


* Apply a set of standard questions to each document in a store


* Return a NO_ANSWER if a given document does not contain the answer to a question

Say you have the financial reports for different companies over different years.
You can gather a set of standard questions which are applicable to each financial report,
like *what is the revenue forecast for 2020?* or *what are the main sources of income?*.
Haystack will try to find an answer for each question within each document!

We’ve seen this style of application be particularly effective in the sphere of finance and patent law
but we see a lot of potential in using this to gain a better overview of academic papers and internal business documents.


## FAQ Style Question Answering

Leverage existing FAQ documents and semantic similarity search to answer new incoming questions.
The workflow is as follows:


* Store a set of FAQ documents in Haystack


* The user presents a new question


* Haystack will find the closest match to the new question in the FAQ documents


* The user will be presented with the most similar Question Answer pair

Haystack’s flexibility allows you to give new users more dynamic access to your existing documentation.

diff --git a/tutorials/Tutorial7_RAG_Generator.ipynb b/tutorials/Tutorial7_RAG_Generator.ipynb
index 80a92c757..67167d26a 100644
--- a/tutorials/Tutorial7_RAG_Generator.ipynb
+++ b/tutorials/Tutorial7_RAG_Generator.ipynb
@@ -1,23 +1,49 @@
{
 "cells": [
  {
-  "cell_type": "code",
+  "cell_type": "markdown",
+  "source": [
+   "# Generative QA with \"Retrieval-Augmented Generation\"\n",
+   "\n",
+   "While extractive QA highlights the span of text that answers a query,\n",
+   "generative QA can return a novel text answer that it has composed.\n",
+   "In this tutorial, you will learn how to set up a generative system using the\n",
+   "[RAG model](https://arxiv.org/abs/2005.11401), which conditions the\n",
+   "answer generator on a set of retrieved documents."
+ ], "metadata": { - "id": "iDyfhfyp7Sjh" - }, + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Here are the packages and imports that we'll need:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], "source": [ "!pip install git+https://github.com/deepset-ai/haystack.git\n", "!pip install urllib3==1.25.4\n", "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } }, { "cell_type": "code", - "metadata": { - "id": "ICZanGLa7khF" - }, + "execution_count": null, + "outputs": [], "source": [ "from typing import List\n", "import requests\n", @@ -27,32 +53,63 @@ "from haystack.generator.transformers import RAGenerator\n", "from haystack.retriever.dense import DensePassageRetriever" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's download a csv containing some sample text and preprocess the data.\n" + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "D3f-CQ4c7lEN" - }, + "execution_count": null, + "outputs": [], "source": [ - "# Add documents from which you want generate answers\n", - "# Download a csv containing some sample documents data\n", - "# Here some sample documents data\n", + "# Download sample\n", "temp = requests.get(\"https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv\")\n", "open('small_generator_dataset.csv', 'wb').write(temp.content)\n", "\n", - "# Get dataframe with columns \"title\", and \"text\"\n", + "# Create dataframe with columns \"title\" and \"text\"\n", "df = pd.read_csv(\"small_generator_dataset.csv\", sep=',')\n", "# Minimal cleaning\n", "df.fillna(value=\"\", inplace=True)\n", "\n", - "print(df.head())\n", - "\n", - "# Create to haystack document format\n", + "print(df.head())" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We can cast our data into Haystack Document objects.\n", + "Alternatively, we can also just use dictionaries with \"text\" and \"meta\" fields" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Use data to initialize Document objects\n", "titles = list(df[\"title\"].values)\n", "texts = list(df[\"text\"].values)\n", - "\n", "documents: List[Document] = []\n", "for title, text in zip(titles, texts):\n", " documents.append(\n", @@ -64,16 +121,29 @@ " )\n", " )" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator.\n", + "FAISS is chosen here since it is optimized vector storage." 
+ ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "upRu3ebX7nr_" - }, + "execution_count": null, + "outputs": [], "source": [ - "# Initialize FAISS document store to documents and corresponding index for embeddings\n", + "# Initialize FAISS document store.\n", "# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding\n", "document_store = FAISSDocumentStore(\n", " faiss_index_factory_str=\"Flat\",\n", @@ -100,37 +170,60 @@ " num_beams=2,\n", ")" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We write documents to the DocumentStore, first by deleting any remaining documents then calling `write_documents()`.\n", + "The `update_embeddings()` method uses the retriever to create an embedding for each document.\n" + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "as8j7hkW7rOW" - }, + "execution_count": null, + "outputs": [], "source": [ "# Delete existing documents in documents store\n", "document_store.delete_all_documents()\n", + "\n", "# Write documents to document store\n", "document_store.write_documents(documents)\n", + "\n", "# Add documents embeddings to index\n", "document_store.update_embeddings(\n", " retriever=retriever\n", ")" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Here are our questions:" + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "j8It45R872vb", - "cellView": "form" - }, + "execution_count": null, + "outputs": [], "source": [ - "#@title\n", - "# Now ask your questions\n", - "# We have some sample questions\n", "QUESTIONS = [\n", " \"who got the first nobel prize in physics\",\n", " \"when is the next deadpool movie being released\",\n", @@ -149,16 +242,31 @@ " \"what is the name of manchester united stadium\",\n", "]" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Now let's run our system!\n", + "The retriever will pick out a small subset of documents that it finds relevant.\n", + "These are used to condition the generator as it generates the answer.\n", + "What it should return then are novel text spans that form and answer to your question!" 
+ ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "xPUHRuTP742h" - }, + "execution_count": null, + "outputs": [], "source": [ - "# Now generate answer for question\n", + "# Now generate an answer for each question\n", "for question in QUESTIONS:\n", " # Retrieve related documents from retriever\n", " retriever_results = retriever.retrieve(\n", @@ -176,19 +284,31 @@ " answers = predicted_result[\"answers\"]\n", " print(f'Generated answer is \\'{answers[0][\"answer\"]}\\' for the question = \\'{question}\\'')" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } } ], "metadata": { - "colab": { - "name": "Tutorial7_RAG_Generator.ipynb", - "provenance": [], - "collapsed_sections": [] - }, "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" } }, "nbformat": 4,