From c07182aa0ab77106cdb142f4ca43ff02476e6fbf Mon Sep 17 00:00:00 2001
From: brandenchan
Date: Thu, 12 Nov 2020 12:07:02 +0100
Subject: [PATCH] First batch of changes

---
 docs/_src/tutorials/tutorials/1.md       |   6 +-
 docs/_src/tutorials/tutorials/2.md       |   6 +-
 docs/_src/tutorials/tutorials/3.md       |   6 +-
 docs/_src/tutorials/tutorials/4.md       |  19 +--
 docs/_src/tutorials/tutorials/5.md       |  14 ++-
 docs/_src/tutorials/tutorials/6.md       |  13 +-
 docs/_src/tutorials/tutorials/7.md       | 144 +++++++++++++++++++++++
 docs/_src/tutorials/tutorials/headers.py |   9 +-
 docs/_src/usage/usage/document_store.md  |   2 +-
 docs/_src/usage/usage/generator.md       |  11 ++
 docs/_src/usage/usage/optimization.md    |  19 +++
 docs/_src/usage/usage/terms.md           |   1 +
 12 files changed, 222 insertions(+), 28 deletions(-)
 create mode 100644 docs/_src/tutorials/tutorials/7.md
 create mode 100644 docs/_src/usage/usage/generator.md
 create mode 100644 docs/_src/usage/usage/optimization.md

diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md
index 92faee639..d7ffdb814 100644
--- a/docs/_src/tutorials/tutorials/1.md
+++ b/docs/_src/tutorials/tutorials/1.md
@@ -28,9 +28,11 @@ marvellous seven kingdoms...
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
diff --git a/docs/_src/tutorials/tutorials/2.md b/docs/_src/tutorials/tutorials/2.md
index 89c053c3f..c3bc0433b 100644
--- a/docs/_src/tutorials/tutorials/2.md
+++ b/docs/_src/tutorials/tutorials/2.md
@@ -22,9 +22,11 @@ This tutorial shows you how to fine-tune a pretrained model on your own dataset.
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
diff --git a/docs/_src/tutorials/tutorials/3.md b/docs/_src/tutorials/tutorials/3.md
index 2a42ae103..ebd353bae 100644
--- a/docs/_src/tutorials/tutorials/3.md
+++ b/docs/_src/tutorials/tutorials/3.md
@@ -22,9 +22,11 @@ If you are interested in more feature-rich Elasticsearch, then please refer to t
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md
index 28f9025d6..9a8dff48f 100644
--- a/docs/_src/tutorials/tutorials/4.md
+++ b/docs/_src/tutorials/tutorials/4.md
@@ -31,9 +31,11 @@ In some use cases, a combination of extractive QA and FAQ-style can also be an i
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
@@ -54,19 +56,19 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock
 
 ```python
 # Recommended: Start Elasticsearch using Docker
-# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2
+# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
 ```
 
 ```python
 # In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
 
 import os
 from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
@@ -98,7 +100,7 @@ We can use the `EmbeddingRetriever` for this purpose and specify a model that we
 
 
 ```python
-retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=False)
+retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=True)
 ```
 
 ### Prepare & Index FAQ data
@@ -121,7 +123,6 @@ print(df.head())
 # Get embeddings for our questions from the FAQs
 questions = list(df["question"].values)
 df["question_emb"] = retriever.embed_queries(texts=questions)
-df["question_emb"] = df["question_emb"].apply(list)  # convert from numpy to list for ES ingestion
 df = df.rename(columns={"answer": "text"})
 
 # Convert Dataframe to list of dicts and index them in our DocumentStore
diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md
index 85475f202..1d6fe4274 100644
--- a/docs/_src/tutorials/tutorials/5.md
+++ b/docs/_src/tutorials/tutorials/5.md
@@ -21,21 +21,23 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
 
 ```python
 # In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2
 
 import os
 from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
diff --git a/docs/_src/tutorials/tutorials/6.md b/docs/_src/tutorials/tutorials/6.md
index ec426daad..9de9b0c3d 100644
--- a/docs/_src/tutorials/tutorials/6.md
+++ b/docs/_src/tutorials/tutorials/6.md
@@ -77,9 +77,11 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
 
 # Install the latest release of Haystack in your own environment
 #! pip install farm-haystack
 
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
 !pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
 ```
@@ -142,11 +144,12 @@ from haystack.retriever.dense import DensePassageRetriever
 retriever = DensePassageRetriever(document_store=document_store,
                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+                                  max_seq_len_query=64,
+                                  max_seq_len_passage=256,
+                                  batch_size=16,
                                   use_gpu=True,
                                   embed_title=True,
-                                  max_seq_len=256,
-                                  batch_size=16,
-                                  remove_sep_tok_from_untitled_passages=True)
+                                  use_fast_tokenizers=True)
 # Important:
 # Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all
 # previously indexed documents and update their embedding representation.
diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md
new file mode 100644
index 000000000..1eb59dad1
--- /dev/null
+++ b/docs/_src/tutorials/tutorials/7.md
@@ -0,0 +1,144 @@
+
+
+```python
+!pip install git+https://github.com/deepset-ai/haystack.git
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
+```
+
+
+```python
+from typing import List
+import requests
+import pandas as pd
+from haystack import Document
+from haystack.document_store.faiss import FAISSDocumentStore
+from haystack.generator.transformers import RAGenerator
+from haystack.retriever.dense import DensePassageRetriever
+```
+
+
+```python
+# Add documents from which you want to generate answers
+# Download a csv containing some sample document data
+temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
+open('small_generator_dataset.csv', 'wb').write(temp.content)
+
+# Get dataframe with columns "title" and "text"
+df = pd.read_csv("small_generator_dataset.csv", sep=',')
+# Minimal cleaning
+df.fillna(value="", inplace=True)
+
+print(df.head())
+
+# Convert to the Haystack Document format
+titles = list(df["title"].values)
+texts = list(df["text"].values)
+
+documents: List[Document] = []
+for title, text in zip(titles, texts):
+    documents.append(
+        Document(
+            text=text,
+            meta={
+                "name": title or ""
+            }
+        )
+    )
+```
+
+
+```python
+# Initialize a FAISS document store for the documents and a corresponding index for the embeddings
+# Set `return_embedding` to `True`, so the generator doesn't have to re-embed the documents
+document_store = FAISSDocumentStore(
+    faiss_index_factory_str="Flat",
+    return_embedding=True
+)
+
+# Initialize a DPR Retriever to encode documents and questions, and to query documents
+retriever = DensePassageRetriever(
+    document_store=document_store,
+    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+    use_gpu=False,
+    embed_title=True,
+)
+
+# Initialize the RAG Generator
+generator = RAGenerator(
+    model_name_or_path="facebook/rag-token-nq",
+    use_gpu=False,
+    top_k_answers=1,
+    max_length=200,
+    min_length=2,
+    embed_title=True,
+    num_beams=2,
+)
+```
+
+
+```python
+# Delete any existing documents in the document store
+document_store.delete_all_documents()
+# Write the documents to the document store
+document_store.write_documents(documents)
+# Compute the document embeddings and add them to the index
+document_store.update_embeddings(
+    retriever=retriever
+)
+```
+
+
+```python
+# Now ask your questions
+# Here are some sample questions
+QUESTIONS = [
+    "who got the first nobel prize in physics",
+    "when is the next deadpool movie being released",
+    "which mode is used for short wave broadcast service",
+    "who is the owner of reading football club",
+    "when is the next scandal episode coming out",
+    "when is the last time the philadelphia won the superbowl",
+    "what is the most current adobe flash player version",
+    "how many episodes are there in dragon ball z",
+    "what is the first step in the evolution of the eye",
+    "where is gall bladder situated in human body",
+    "what is the main mineral in lithium batteries",
+    "who is the president of usa right now",
+    "where do the greasers live in the outsiders",
+    "panda is a national animal of which country",
+    "what is the name of manchester united stadium",
+]
+```
+
+
+```python
+# Generate an answer for each question
+for question in QUESTIONS:
+    # Retrieve related documents with the retriever
+    retriever_results = retriever.retrieve(
+        query=question
+    )
+
+    # Now generate an answer from the question and the retrieved documents
+    predicted_result = generator.predict(
+        question=question,
+        documents=retriever_results,
+        top_k=1
+    )
+
+    # Print the answer
+    answers = predicted_result["answers"]
+    print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
+```
diff --git a/docs/_src/tutorials/tutorials/headers.py b/docs/_src/tutorials/tutorials/headers.py
index b09e2a4fc..a7cb8c221 100644
--- a/docs/_src/tutorials/tutorials/headers.py
+++ b/docs/_src/tutorials/tutorials/headers.py
@@ -46,6 +46,13 @@ metaDescription: ""
 slug: "/docs/tutorial6"
 date: "2020-09-03"
 id: "tutorial6md"
---->"""
+--->""",
+    7: """<!---
+title: "Tutorial 7"
+metaTitle: ""
+metaDescription: ""
+slug: "/docs/tutorial7"
+id: "tutorial7md"
+--->"""
 }
diff --git a/docs/_src/usage/usage/document_store.md b/docs/_src/usage/usage/document_store.md
index 1827ca1c5..ff182e123 100644
--- a/docs/_src/usage/usage/document_store.md
+++ b/docs/_src/usage/usage/document_store.md
@@ -79,7 +79,7 @@ See API documentation for more info.
 
 DocumentStores expect Documents in dictionary form, like that below.
 They are loaded using the `DocumentStore.write_documents()` method.
-See [Preprocessing](/docs/latest/preprocessingmd) for more information on how to best prepare your data.
+See [Preprocessing](/docs/latest/preprocessingmd) for more information on the cleaning and splitting steps that will help you maximize Haystack's performance.
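+
+For illustration, a minimal sketch of this dictionary format and how it is loaded (the text and metadata values here are placeholders):
+
+```python
+from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
+
+document_store = ElasticsearchDocumentStore()
+
+# Each Document is a dictionary with the text itself plus optional metadata
+dicts = [
+    {
+        "text": "Some text that you want to make searchable.",
+        "meta": {"name": "example_doc.txt"}
+    }
+]
+document_store.write_documents(dicts)
+```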
 
 [//]: # (Add link to preprocessing section)
diff --git a/docs/_src/usage/usage/generator.md b/docs/_src/usage/usage/generator.md
new file mode 100644
index 000000000..a899a11c3
--- /dev/null
+++ b/docs/_src/usage/usage/generator.md
@@ -0,0 +1,11 @@
+<!---
+title: "Generator"
+metaTitle: "Generator"
+metaDescription: ""
+slug: "/docs/generator"
+id: "generatormd"
+--->
+
+# Generator
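+
+A minimal usage sketch, condensed from Tutorial 7 (the model names and parameters are the ones used there):
+
+```python
+from haystack.document_store.faiss import FAISSDocumentStore
+from haystack.generator.transformers import RAGenerator
+from haystack.retriever.dense import DensePassageRetriever
+
+# A FAISS store that returns embeddings, so the generator can reuse them
+document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)
+
+# DPR retriever that encodes both questions and documents
+retriever = DensePassageRetriever(
+    document_store=document_store,
+    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
+    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+    embed_title=True,
+)
+
+# RAG generator that fuses the question with the retrieved documents
+generator = RAGenerator(model_name_or_path="facebook/rag-token-nq", top_k_answers=1, max_length=200)
+
+# After writing documents and calling document_store.update_embeddings(retriever=retriever),
+# an answer can be generated from the question and the retrieved documents:
+docs = retriever.retrieve(query="who got the first nobel prize in physics")
+result = generator.predict(question="who got the first nobel prize in physics", documents=docs, top_k=1)
+print(result["answers"][0]["answer"])
+```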
diff --git a/docs/_src/usage/usage/optimization.md b/docs/_src/usage/usage/optimization.md
new file mode 100644
index 000000000..50d5e57ec
--- /dev/null
+++ b/docs/_src/usage/usage/optimization.md
@@ -0,0 +1,19 @@
+<!---
+title: "Optimization"
+metaTitle: "Optimization"
+metaDescription: ""
+slug: "/docs/optimization"
+id: "optimizationmd"
+--->
+
+# Optimization
+
+- Cleaning
+- Splitting
+- ES Language
+- top-k (recommend 10 for the Retriever, 5 for the Reader)
+- Batch size / GPU
+- Doc stride / Max seq len
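+
+These knobs live on the Reader, the Retriever, and the `Finder` query call. A minimal sketch of where to set them, assuming the Elasticsearch setup used in the tutorials (the values are illustrative starting points, not tuned recommendations):
+
+```python
+from haystack import Finder
+from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
+from haystack.reader.farm import FARMReader
+from haystack.retriever.sparse import ElasticsearchRetriever
+
+document_store = ElasticsearchDocumentStore()
+retriever = ElasticsearchRetriever(document_store=document_store)
+
+# Reader-side knobs: batch size, maximum sequence length and document stride
+reader = FARMReader(
+    model_name_or_path="deepset/roberta-base-squad2",
+    use_gpu=True,
+    batch_size=50,
+    max_seq_len=256,
+    doc_stride=128
+)
+
+finder = Finder(reader, retriever)
+
+# top-k knobs: how many documents the Retriever passes on,
+# and how many answers the Reader returns
+prediction = finder.get_answers(
+    question="Who is the father of Arya Stark?",
+    top_k_retriever=10,
+    top_k_reader=5
+)
+```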
\ No newline at end of file
diff --git a/docs/_src/usage/usage/terms.md b/docs/_src/usage/usage/terms.md
index cb98a53c4..d9b7be75d 100644
--- a/docs/_src/usage/usage/terms.md
+++ b/docs/_src/usage/usage/terms.md
@@ -44,6 +44,7 @@ In question answering models (and hence in Haystack Readers), this is usually a
 
 **Question Answering (QA)** - A popular task in the world of NLP where systems have to find answers to questions.
 The term is generally used to refer to extractive question answering, where a system has to find the minimal text span in a given document that contains the answer to the question.
+Note, however, that it may also refer to abstractive question answering or FAQ matching.
 
 **Reader** - The component in Haystack that does the closest reading of a document to extract the exact text which answers a question.