From c07182aa0ab77106cdb142f4ca43ff02476e6fbf Mon Sep 17 00:00:00 2001
From: brandenchan
Date: Thu, 12 Nov 2020 12:07:02 +0100
Subject: [PATCH] First batch of changes

---
 docs/_src/tutorials/tutorials/1.md       |   6 +-
 docs/_src/tutorials/tutorials/2.md       |   6 +-
 docs/_src/tutorials/tutorials/3.md       |   6 +-
 docs/_src/tutorials/tutorials/4.md       |  19 +--
 docs/_src/tutorials/tutorials/5.md       |  14 ++-
 docs/_src/tutorials/tutorials/6.md       |  13 +-
 docs/_src/tutorials/tutorials/7.md       | 144 +++++++++++++++++++++++
 docs/_src/tutorials/tutorials/headers.py |   9 +-
 docs/_src/usage/usage/document_store.md  |   2 +-
 docs/_src/usage/usage/generator.md       |  11 ++
 docs/_src/usage/usage/optimization.md    |  19 +++
 docs/_src/usage/usage/terms.md           |   1 +
 12 files changed, 222 insertions(+), 28 deletions(-)
 create mode 100644 docs/_src/tutorials/tutorials/7.md
 create mode 100644 docs/_src/usage/usage/generator.md
 create mode 100644 docs/_src/usage/usage/optimization.md

diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md
index 92faee639..d7ffdb814 100644
--- a/docs/_src/tutorials/tutorials/1.md
+++ b/docs/_src/tutorials/tutorials/1.md
@@ -28,9 +28,11 @@ marvellous seven kingdoms...
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
diff --git a/docs/_src/tutorials/tutorials/2.md b/docs/_src/tutorials/tutorials/2.md
index 89c053c3f..c3bc0433b 100644
--- a/docs/_src/tutorials/tutorials/2.md
+++ b/docs/_src/tutorials/tutorials/2.md
@@ -22,9 +22,11 @@ This tutorial shows you how to fine-tune a pretrained model on your own dataset.
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
diff --git a/docs/_src/tutorials/tutorials/3.md b/docs/_src/tutorials/tutorials/3.md
index 2a42ae103..ebd353bae 100644
--- a/docs/_src/tutorials/tutorials/3.md
+++ b/docs/_src/tutorials/tutorials/3.md
@@ -22,9 +22,11 @@ If you are interested in more feature-rich Elasticsearch, then please refer to t
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md
index 28f9025d6..9a8dff48f 100644
--- a/docs/_src/tutorials/tutorials/4.md
+++ b/docs/_src/tutorials/tutorials/4.md
@@ -31,9 +31,11 @@ In some use cases, a combination of extractive QA and FAQ-style can also be an i
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
@@ -54,19 +56,19 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock
 
 ```python
 # Recommended: Start Elasticsearch using Docker
-# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2
+# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
 ```
 
 ```python
 # In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
 
 import os
 from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
@@ -98,7 +100,7 @@ We can use the `EmbeddingRetriever` for this purpose and specify a model that we
 
 
 ```python
-retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=False)
+retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=True)
 ```
 
 ### Prepare & Index FAQ data
@@ -121,7 +123,6 @@ print(df.head())
 # Get embeddings for our questions from the FAQs
 questions = list(df["question"].values)
 df["question_emb"] = retriever.embed_queries(texts=questions)
-df["question_emb"] = df["question_emb"].apply(list)  # convert from numpy to list for ES ingestion
 df = df.rename(columns={"answer": "text"})
 
 # Convert Dataframe to list of dicts and index them in our DocumentStore
diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md
index 85475f202..1d6fe4274 100644
--- a/docs/_src/tutorials/tutorials/5.md
+++ b/docs/_src/tutorials/tutorials/5.md
@@ -21,21 +21,23 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
 
 ```python
 # In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
 
 import os
 from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
diff --git a/docs/_src/tutorials/tutorials/6.md b/docs/_src/tutorials/tutorials/6.md
index ec426daad..9de9b0c3d 100644
--- a/docs/_src/tutorials/tutorials/6.md
+++ b/docs/_src/tutorials/tutorials/6.md
@@ -77,9 +77,11 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
@@ -142,11 +144,12 @@ from haystack.retriever.dense import DensePassageRetriever
 retriever = DensePassageRetriever(document_store=document_store,
                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+                                  max_seq_len_query=64,
+                                  max_seq_len_passage=256,
+                                  batch_size=16,
                                   use_gpu=True,
                                   embed_title=True,
-                                  max_seq_len=256,
-                                  batch_size=16,
-                                  remove_sep_tok_from_untitled_passages=True)
+                                  use_fast_tokenizers=True)
 # Important:
 # Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all
 # previously indexed documents and update their embedding representation.
diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md
new file mode 100644
index 000000000..1eb59dad1
--- /dev/null
+++ b/docs/_src/tutorials/tutorials/7.md
@@ -0,0 +1,144 @@
+
+
+```python
+!pip install git+https://github.com/deepset-ai/haystack.git
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
+```
+
+
+```python
+from typing import List
+import requests
+import pandas as pd
+from haystack import Document
+from haystack.document_store.faiss import FAISSDocumentStore
+from haystack.generator.transformers import RAGenerator
+from haystack.retriever.dense import DensePassageRetriever
+```
+
+
+```python
+# Add documents from which you want to generate answers
+# Download a csv containing some sample document data
+temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
+open('small_generator_dataset.csv', 'wb').write(temp.content)
+
+# Get dataframe with columns "title" and "text"
+df = pd.read_csv("small_generator_dataset.csv", sep=',')
+# Minimal cleaning
+df.fillna(value="", inplace=True)
+
+print(df.head())
+
+# Convert to the Haystack Document format
+titles = list(df["title"].values)
+texts = list(df["text"].values)
+
+documents: List[Document] = []
+for title, text in zip(titles, texts):
+    documents.append(
+        Document(
+            text=text,
+            meta={
+                "name": title or ""
+            }
+        )
+    )
+```
+
+
+```python
+# Initialize a FAISS document store for the documents and a corresponding index for the embeddings
+# Set `return_embedding` to `True`, so the generator doesn't have to re-embed the documents
+document_store = FAISSDocumentStore(
+    faiss_index_factory_str="Flat",
+    return_embedding=True
+)
+
+# Initialize a DPR Retriever to encode documents and questions, and to query documents
+retriever = DensePassageRetriever(
+    document_store=document_store,
+    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+    use_gpu=False,
+    embed_title=True,
+)
+
+# Initialize the RAG Generator
+generator = RAGenerator(
+    model_name_or_path="facebook/rag-token-nq",
+    use_gpu=False,
+    top_k_answers=1,
+    max_length=200,
+    min_length=2,
+    embed_title=True,
+    num_beams=2,
+)
+```
+
+
+```python
+# Delete any existing documents in the document store
+document_store.delete_all_documents()
+# Write the documents to the document store
+document_store.write_documents(documents)
+# Compute the document embeddings and add them to the index
+document_store.update_embeddings(
+    retriever=retriever
+)
+```
+
+
+```python
+# Now ask your questions
+# Here are some sample questions
+QUESTIONS = [
+    "who got the first nobel prize in physics",
+    "when is the next deadpool movie being released",
+    "which mode is used for short wave broadcast service",
+    "who is the owner of reading football club",
+    "when is the next scandal episode coming out",
+    "when is the last time the philadelphia won the superbowl",
+    "what is the most current adobe flash player version",
+    "how many episodes are there in dragon ball z",
+    "what is the first step in the evolution of the eye",
+    "where is gall bladder situated in human body",
+    "what is the main mineral in lithium batteries",
+    "who is the president of usa right now",
+    "where do the greasers live in the outsiders",
+    "panda is a national animal of which country",
+    "what is the name of manchester united stadium",
+]
+```
+
+
+```python
+# Generate an answer for each question
+for question in QUESTIONS:
+    # Retrieve related documents with the retriever
+    retriever_results = retriever.retrieve(
+        query=question
+    )
+
+    # Now generate an answer from the question and the retrieved documents
+    predicted_result = generator.predict(
+        question=question,
+        documents=retriever_results,
+        top_k=1
+    )
+
+    # Print the answer
+    answers = predicted_result["answers"]
+    print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
+```
diff --git a/docs/_src/tutorials/tutorials/headers.py b/docs/_src/tutorials/tutorials/headers.py
index b09e2a4fc..a7cb8c221 100644
--- a/docs/_src/tutorials/tutorials/headers.py
+++ b/docs/_src/tutorials/tutorials/headers.py
@@ -46,6 +46,13 @@ metaDescription: ""
 slug: "/docs/tutorial6"
 date: "2020-09-03"
 id: "tutorial6md"
---->"""
+--->""",
+    7: """<!---
+title: "Tutorial 7"
+metaTitle: ""
+metaDescription: ""
+slug: "/docs/tutorial7"
+id: "tutorial7md"
+--->"""
 }
diff --git a/docs/_src/usage/usage/document_store.md b/docs/_src/usage/usage/document_store.md
index 1827ca1c5..ff182e123 100644
--- a/docs/_src/usage/usage/document_store.md
+++ b/docs/_src/usage/usage/document_store.md
@@ -79,7 +79,7 @@ See API documentation for more info.
 
 DocumentStores expect Documents in dictionary form, like that below.
 They are loaded using the `DocumentStore.write_documents()` method.
-See [Preprocessing](/docs/latest/preprocessingmd) for more information on how to best prepare your data.
+See [Preprocessing](/docs/latest/preprocessingmd) for more information on the cleaning and splitting steps that will help you maximize Haystack's performance.
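+
+For illustration, a minimal sketch of this dictionary format and how it is loaded (the text and metadata values here are placeholders):
+
+```python
+from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
+
+document_store = ElasticsearchDocumentStore()
+
+# Each Document is a dictionary with the text itself plus optional metadata
+dicts = [
+    {
+        "text": "Some text that you want to make searchable.",
+        "meta": {"name": "example_doc.txt"}
+    }
+]
+document_store.write_documents(dicts)
+```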
 
 [//]: # (Add link to preprocessing section)
diff --git a/docs/_src/usage/usage/generator.md b/docs/_src/usage/usage/generator.md
new file mode 100644
index 000000000..a899a11c3
--- /dev/null
+++ b/docs/_src/usage/usage/generator.md
@@ -0,0 +1,11 @@
+<!---
+title: "Generator"
+metaTitle: "Generator"
+metaDescription: ""
+slug: "/docs/generator"
+id: "generatormd"
+--->
+
+# Generator
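+
+A minimal usage sketch, condensed from Tutorial 7 (the model names and parameters are the ones used there):
+
+```python
+from haystack.document_store.faiss import FAISSDocumentStore
+from haystack.generator.transformers import RAGenerator
+from haystack.retriever.dense import DensePassageRetriever
+
+# A FAISS store that returns embeddings, so the generator can reuse them
+document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)
+
+# DPR retriever that encodes both questions and documents
+retriever = DensePassageRetriever(
+    document_store=document_store,
+    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+    embed_title=True,
+)
+
+# RAG generator that fuses the question with the retrieved documents
+generator = RAGenerator(model_name_or_path="facebook/rag-token-nq", top_k_answers=1, max_length=200)
+
+# After writing documents and calling document_store.update_embeddings(retriever=retriever),
+# an answer can be generated from the question and the retrieved documents:
+docs = retriever.retrieve(query="who got the first nobel prize in physics")
+result = generator.predict(question="who got the first nobel prize in physics", documents=docs, top_k=1)
+print(result["answers"][0]["answer"])
+```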
diff --git a/docs/_src/usage/usage/optimization.md b/docs/_src/usage/usage/optimization.md
new file mode 100644
index 000000000..50d5e57ec
--- /dev/null
+++ b/docs/_src/usage/usage/optimization.md
@@ -0,0 +1,19 @@
+<!---
+title: "Optimization"
+metaTitle: "Optimization"
+metaDescription: ""
+slug: "/docs/optimization"
+id: "optimizationmd"
+--->
+
+# Optimization
+
+- Cleaning
+- Splitting
+- ES Language
+- top-k (recommend 10 for the Retriever, 5 for the Reader)
+- Batch size / GPU
+- Doc stride / Max seq len
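+
+These knobs live on the Reader, the Retriever, and the `Finder` query call. A minimal sketch of where to set them, assuming the Elasticsearch setup used in the tutorials (the values are illustrative starting points, not tuned recommendations):
+
+```python
+from haystack import Finder
+from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
+from haystack.reader.farm import FARMReader
+from haystack.retriever.sparse import ElasticsearchRetriever
+
+document_store = ElasticsearchDocumentStore()
+retriever = ElasticsearchRetriever(document_store=document_store)
+
+# Reader-side knobs: batch size, maximum sequence length and document stride
+reader = FARMReader(
+    model_name_or_path="deepset/roberta-base-squad2",
+    use_gpu=True,
+    batch_size=50,
+    max_seq_len=256,
+    doc_stride=128
+)
+
+finder = Finder(reader, retriever)
+
+# top-k knobs: how many documents the Retriever passes on,
+# and how many answers the Reader returns
+prediction = finder.get_answers(
+    question="Who is the father of Arya Stark?",
+    top_k_retriever=10,
+    top_k_reader=5
+)
+```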
\ No newline at end of file
diff --git a/docs/_src/usage/usage/terms.md b/docs/_src/usage/usage/terms.md
index cb98a53c4..d9b7be75d 100644
--- a/docs/_src/usage/usage/terms.md
+++ b/docs/_src/usage/usage/terms.md
@@ -44,6 +44,7 @@ In question answering models (and hence in Haystack Readers), this is usually a
 
 **Question Answering (QA)** - A popular task in the world of NLP where systems have to find answers to questions.
 The term is generally used to refer to extractive question answering, where a system has to find the minimal text span in a given document that contains the answer to the question.
+Note, however, that it may also refer to abstractive question answering or FAQ matching.
 
 **Reader** - The component in Haystack that does the closest reading of a document to extract the exact text which answers a question.