From 1e8af84ecc7f59afffbcb1a28e3779031cc81987 Mon Sep 17 00:00:00 2001
From: Branden Chan <33759007+brandenchan@users.noreply.github.com>
Date: Thu, 19 Nov 2020 14:58:27 +0100
Subject: [PATCH] Make more changes to documentation (#578)

* First batch of changes

* Add RAG tutorial links

* Prettify RAG tutorial

* draft of generator doc

* Add text

* Complete generator page

* Create optimization section

* Split intro

* Fix formatting tutorial 7
---
 README.md                                |   5 +
 docs/_src/tutorials/tutorials/1.md       |   6 +-
 docs/_src/tutorials/tutorials/2.md       |   6 +-
 docs/_src/tutorials/tutorials/3.md       |   6 +-
 docs/_src/tutorials/tutorials/4.md       |  19 +-
 docs/_src/tutorials/tutorials/5.md       |  14 +-
 docs/_src/tutorials/tutorials/6.md       |  13 +-
 docs/_src/tutorials/tutorials/7.md       | 174 +++++++++++++++++
 docs/_src/tutorials/tutorials/headers.py |   9 +-
 docs/_src/usage/usage/document_store.md  |   2 +-
 docs/_src/usage/usage/generator.md       |  29 +++
 docs/_src/usage/usage/intro.md           |  60 ------
 docs/_src/usage/usage/optimization.md    |  58 ++++++
 docs/_src/usage/usage/preprocessing.md   |  28 +--
 docs/_src/usage/usage/retriever.md       |   2 +
 docs/_src/usage/usage/terms.md           |   1 +
 docs/_src/usage/usage/use_cases.md       |  69 +++++++
 tutorials/Tutorial7_RAG_Generator.ipynb  | 232 +++++++++++++++++------
 18 files changed, 563 insertions(+), 170 deletions(-)
 create mode 100644 docs/_src/tutorials/tutorials/7.md
 create mode 100644 docs/_src/usage/usage/generator.md
 create mode 100644 docs/_src/usage/usage/optimization.md
 create mode 100644 docs/_src/usage/usage/use_cases.md

diff --git a/README.md b/README.md
index 1ae0fbc48..d247579af 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,11 @@ We recommend Elasticsearch or FAISS, but have also more light-weight options for
  [Jupyter notebook](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb)
  or
  [Colab](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_DPR.ipynb)

+- Tutorial 7 - Generative QA via "Retrieval-Augmented Generation":
+  [Jupyter notebook](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb)
+  or
+  [Colab](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb)
+
## Quick Tour

[File Conversion](https://github.com/deepset-ai/haystack/blob/master/README.md#1-file-conversion) | [Preprocessing](https://github.com/deepset-ai/haystack/blob/master/README.md#2-preprocessing) | [DocumentStores](https://github.com/deepset-ai/haystack/blob/master/README.md#3-documentstores) | [Retrievers](https://github.com/deepset-ai/haystack/blob/master/README.md#5-retrievers) | [Readers](https://github.com/deepset-ai/haystack/blob/master/README.md#5-readers) | [REST API](https://github.com/deepset-ai/haystack/blob/master/README.md#6-rest-api) | [Labeling Tool](https://github.com/deepset-ai/haystack/blob/master/README.md#7-labeling-tool)

diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md
index 92faee639..d7ffdb814 100644
--- a/docs/_src/tutorials/tutorials/1.md
+++ b/docs/_src/tutorials/tutorials/1.md
@@ -28,9 +28,11 @@ marvellous seven kingdoms...

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

diff --git a/docs/_src/tutorials/tutorials/2.md b/docs/_src/tutorials/tutorials/2.md
index 89c053c3f..c3bc0433b 100644
--- a/docs/_src/tutorials/tutorials/2.md
+++ b/docs/_src/tutorials/tutorials/2.md
@@ -22,9 +22,11 @@ This tutorial shows you how to fine-tune a pretrained model on your own dataset.

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

diff --git a/docs/_src/tutorials/tutorials/3.md b/docs/_src/tutorials/tutorials/3.md
index 2a42ae103..ebd353bae 100644
--- a/docs/_src/tutorials/tutorials/3.md
+++ b/docs/_src/tutorials/tutorials/3.md
@@ -22,9 +22,11 @@ If you are interested in more feature-rich Elasticsearch, then please refer to t

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md
index 28f9025d6..9a8dff48f 100644
--- a/docs/_src/tutorials/tutorials/4.md
+++ b/docs/_src/tutorials/tutorials/4.md
@@ -31,9 +31,11 @@ In some use cases, a combination of extractive QA and FAQ-style can also be an i

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

@@ -54,19 +56,19 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock

```python
# Recommended: Start Elasticsearch using Docker
-# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2
+# ! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
```


```python
# In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

@@ -98,7 +100,7 @@ We can use the `EmbeddingRetriever` for this purpose and specify a model that we

```python
-retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=False)
+retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=True)
```

### Prepare & Index FAQ data

@@ -121,7 +123,6 @@
# Get embeddings for our questions from the FAQs
questions = list(df["question"].values)
df["question_emb"] = retriever.embed_queries(texts=questions)
-df["question_emb"] = df["question_emb"].apply(list)  # convert from numpy to list for ES ingestion
df = df.rename(columns={"answer": "text"})

# Convert Dataframe to list of dicts and index them in our DocumentStore

diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md
index 85475f202..1d6fe4274 100644
--- a/docs/_src/tutorials/tutorials/5.md
+++ b/docs/_src/tutorials/tutorials/5.md
@@ -21,21 +21,23 @@ You can start Elasticsearch on your local machine instance using Docker. If Dock

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

```python
# In Colab / No Docker environments: Start Elasticsearch from source
-! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
-! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
-! chown -R daemon:daemon elasticsearch-7.6.2
+! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
+! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
+! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
-es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
+es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )

diff --git a/docs/_src/tutorials/tutorials/6.md b/docs/_src/tutorials/tutorials/6.md
index ec426daad..9de9b0c3d 100644
--- a/docs/_src/tutorials/tutorials/6.md
+++ b/docs/_src/tutorials/tutorials/6.md
@@ -77,9 +77,11 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial

# Install the latest release of Haystack in your own environment
#! pip install farm-haystack
-# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
+# Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
-!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+!pip install urllib3==1.25.4
+!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+

```

@@ -142,11 +144,12 @@ from haystack.retriever.dense import DensePassageRetriever
retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+                                 max_seq_len_query=64,
+                                 max_seq_len_passage=256,
+                                 batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
-                                 max_seq_len=256,
-                                 batch_size=16,
-                                 remove_sep_tok_from_untitled_passages=True)
+                                 use_fast_tokenizers=True)
# Important:
# Now that we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.

diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md
new file mode 100644
index 000000000..c89df7ab5
--- /dev/null
+++ b/docs/_src/tutorials/tutorials/7.md
@@ -0,0 +1,174 @@


# Generative QA with "Retrieval-Augmented Generation"

While extractive QA highlights the span of text that answers a query,
generative QA can return a novel text answer that it has composed.
In this tutorial, you will learn how to set up a generative system using the
[RAG model](https://arxiv.org/abs/2005.11401), which conditions the
answer generator on a set of retrieved documents.

Here are the packages and imports that we'll need:


```python
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install urllib3==1.25.4
!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

```


```python
from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.generator.transformers import RAGenerator
from haystack.retriever.dense import DensePassageRetriever
```

Let's download a CSV containing some sample text and preprocess the data.



```python
# Download sample
temp = requests.get("https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv")
open('small_generator_dataset.csv', 'wb').write(temp.content)

# Create dataframe with columns "title" and "text"
df = pd.read_csv("small_generator_dataset.csv", sep=',')
# Minimal cleaning
df.fillna(value="", inplace=True)

print(df.head())
```

We can cast our data into Haystack Document objects.
Alternatively, we can also just use dictionaries with "text" and "meta" fields.


```python
# Use data to initialize Document objects
titles = list(df["title"].values)
texts = list(df["text"].values)
documents: List[Document] = []
for title, text in zip(titles, texts):
    documents.append(
        Document(
            text=text,
            meta={
                "name": title or ""
            }
        )
    )
```

Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator.
FAISS is chosen here since it is an optimized vector store.


```python
# Initialize FAISS document store.
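# ("Flat" gives exact, brute-force similarity search, which suits the small
# sample dataset used here; for larger corpora, a different FAISS index factory
# string can trade a little accuracy for speed.)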
# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True
)

# Initialize DPR Retriever to encode documents, encode question and query documents
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
    embed_title=True,
)

# Initialize RAG Generator
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=False,
    top_k_answers=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=2,
)
```

We write documents to the DocumentStore, first deleting any existing documents and then calling `write_documents()`.
The `update_embeddings()` method uses the retriever to create an embedding for each document.



```python
# Delete existing documents in document store
document_store.delete_all_documents()

# Write documents to document store
document_store.write_documents(documents)

# Add documents embeddings to index
document_store.update_embeddings(
    retriever=retriever
)
```

Here are our questions:


```python
QUESTIONS = [
    "who got the first nobel prize in physics",
    "when is the next deadpool movie being released",
    "which mode is used for short wave broadcast service",
    "who is the owner of reading football club",
    "when is the next scandal episode coming out",
    "when is the last time the philadelphia won the superbowl",
    "what is the most current adobe flash player version",
    "how many episodes are there in dragon ball z",
    "what is the first step in the evolution of the eye",
    "where is gall bladder situated in human body",
    "what is the main mineral in lithium batteries",
    "who is the president of usa right now",
    "where do the greasers live in the outsiders",
    "panda is a national animal of which country",
    "what is the name of manchester united stadium",
]
```

Now let's run our system!
The retriever will pick out a small subset of documents that it finds relevant.
These are used to condition the generator as it generates the answer.
It should then return novel text spans that form an answer to your question!


```python
# Now generate an answer for each question
for question in QUESTIONS:
    # Retrieve related documents from retriever
    retriever_results = retriever.retrieve(
        query=question
    )

    # Now generate answer from question and retrieved documents
    predicted_result = generator.predict(
        question=question,
        documents=retriever_results,
        top_k=1
    )

    # Print your answer
    answers = predicted_result["answers"]
    print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
```

diff --git a/docs/_src/tutorials/tutorials/headers.py b/docs/_src/tutorials/tutorials/headers.py
index b09e2a4fc..a7cb8c221 100644
--- a/docs/_src/tutorials/tutorials/headers.py
+++ b/docs/_src/tutorials/tutorials/headers.py
@@ -46,6 +46,13 @@ metaDescription: ""
slug: "/docs/tutorial6"
date: "2020-09-03"
id: "tutorial6md"
+--->""",
+    7: """"""
-    }

diff --git a/docs/_src/usage/usage/document_store.md b/docs/_src/usage/usage/document_store.md
index 1827ca1c5..ff182e123 100644
--- a/docs/_src/usage/usage/document_store.md
+++ b/docs/_src/usage/usage/document_store.md
@@ -79,7 +79,7 @@ See API documentation for more info.
DocumentStores expect Documents in dictionary form, like that below.
They are loaded using the `DocumentStore.write_documents()` method.
-See [Preprocessing](/docs/latest/preprocessingmd) for more information on how to best prepare your data.
+See [Preprocessing](/docs/latest/preprocessingmd) for more information on the cleaning and splitting steps that will help you maximize Haystack's performance.

[//]: # (Add link to preprocessing section)

diff --git a/docs/_src/usage/usage/generator.md b/docs/_src/usage/usage/generator.md
new file mode 100644
index 000000000..9b04b34a2
--- /dev/null
+++ b/docs/_src/usage/usage/generator.md
@@ -0,0 +1,29 @@


# Generator

See [Tutorial 7](/docs/latest/tutorial7md) for a guide on how to build your own generative QA system.

While extractive QA highlights the span of text that answers a query,
generative QA can return a novel text answer that it has composed.
The best current approaches, such as [Retrieval-Augmented Generation](https://arxiv.org/abs/2005.11401),
can draw upon both the knowledge they gained during language model pretraining (parametric memory)
and the passages provided to them by a retriever (non-parametric memory).
With the advent of Transformer-based retrieval methods such as [Dense Passage Retrieval](https://arxiv.org/abs/2004.04906),
retriever and generator can be trained concurrently from a single loss signal.

Pros
* More appropriately phrased answers
* Able to synthesize information from different texts
* Can draw on the latent knowledge stored in the language model

Cons
* Not easy to track what piece of information the generator is basing its response on

diff --git a/docs/_src/usage/usage/intro.md b/docs/_src/usage/usage/intro.md
index 28917abb4..d20372c13 100644
--- a/docs/_src/usage/usage/intro.md
+++ b/docs/_src/usage/usage/intro.md
@@ -24,66 +24,6 @@ and Haystack is designed to be the bridge between research and industry.
* **Domain adaptation**: Fine-tune models to your own domain & improve them continuously via user feedback

## Use cases

### Semantic Search System

Take the leap from using keyword search on your own documents to semantic search with Haystack.


* Store your documents in the database of your choice (Elasticsearch, SQL, in memory, FAISS)


* Perform question driven queries.

Expect to see results that highlight the very sentence that contains the answer to your question.
Thanks to the power of Transformer based language models, results are chosen based on compatibility in meaning
rather than lexical overlap.



![image](../../img/search.png)

### Information Extractor

Automate the extraction of relevant information from a set of documents that pertain to the same topics but for different entities.

Haystack can:


* Apply a set of standard questions to each document in a store


* Return a NO_ANSWER if a given document does not contain the answer to a question

Say you have the financial reports for different companies over different years.
You can gather a set of standard questions which are applicable to each financial report,
like *what is the revenue forecast for 2020?* or *what are the main sources of income?*.
Haystack will try to find an answer for each question within each document!

We’ve seen this style of application be particularly effective in the sphere of finance and patent law
but we see a lot of potential in using this to gain a better overview of academic papers and internal business documents.

### FAQ Style Question Answering

Leverage existing FAQ documents and semantic similarity search to answer new incoming questions.
The workflow is as follows:


* Store a set of FAQ documents in Haystack


* The user presents a new question


* Haystack will find the closest match to the new question in the FAQ documents


* The user will be presented with the most similar Question Answer pair

Haystack’s flexibility allows you to give new users more dynamic access to your existing documentation.

## Technology

diff --git a/docs/_src/usage/usage/optimization.md b/docs/_src/usage/usage/optimization.md
new file mode 100644
index 000000000..226754b46
--- /dev/null
+++ b/docs/_src/usage/usage/optimization.md
@@ -0,0 +1,58 @@


# Optimization

## Document Length

Document length has a very direct impact on the speed of the Reader,
which is why we recommend using the `PreProcessor` class to clean and split your documents.
**If you halve the length of your documents, you will halve the workload placed on your Reader.**

For **sparse retrievers**, very long documents pose a challenge since the signal of the relevant section of text
can get washed out by the rest of the document.
We would recommend making sure that **documents are no longer than 10,000 words**.

**Dense retrievers** are limited in the length of text that they can read in one pass.
As such, it is important that documents are not longer than the dense retriever's maximum input length.
By default, Haystack's DensePassageRetriever model has a maximum length of 256 tokens.
As such, we recommend that documents contain significantly fewer words.
We have found decent performance with **documents around 100 words long**.

## Respecting Sentence Boundaries

When splitting documents, it is generally not a good idea to let document boundaries fall in the middle of sentences.
Doing so means that each document will contain incomplete sentence fragments
which may be hard for both retriever and reader to interpret.
It is therefore recommended to set `split_respect_sentence_boundary=True` when initializing your `PreProcessor`.

## Choosing the Right top-k Values

The `top-k` parameter in both the `Retriever` and `Reader` determines how many results they return.
More specifically, `Retriever` `top-k` dictates how many retrieved documents are passed on to the next stage,
while `Reader` `top-k` determines how many answer candidates to show.

In our experiments, we have found that **`Retriever` `top_k=10`
gives decent overall performance** and so we have set this as the default in Haystack.

The choice of `Retriever` `top-k` is a trade-off between speed and accuracy,
especially when there is a `Reader` in the pipeline.
Setting it higher means passing more documents to the `Reader`,
thus reducing the chance that the answer-containing passage is missed.
However, passing more documents to the `Reader` will create a larger workload for the component.
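
If you are unsure where this trade-off leaves you, it can help to measure the impact directly.
Below is a minimal sketch that times one query at a few retriever top-k settings;
it assumes an already initialized `Finder` (see below) and an illustrative question:
``` python
import time

# Illustrative benchmark: higher top_k_retriever usually improves recall
# but increases Reader workload and latency.
for k in [1, 5, 10, 20]:
    start = time.perf_counter()
    answers = finder.get_answers(question="What is the revenue forecast for 2020?",
                                 top_k_retriever=k, top_k_reader=5)
    print(f"top_k_retriever={k}: {time.perf_counter() - start:.2f} s")
```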
These parameters can easily be tweaked as follows if using a `Finder`:
``` python
answers = finder.get_answers(question=question,
                             top_k_retriever=10,
                             top_k_reader=5)
```
or like this if directly calling the `Retriever`:
``` python
retrieved_docs = retriever.retrieve(query=question, top_k=10)
```

diff --git a/docs/_src/usage/usage/preprocessing.md b/docs/_src/usage/usage/preprocessing.md
index 1faa8be7d..ee49db7ed 100644
--- a/docs/_src/usage/usage/preprocessing.md
+++ b/docs/_src/usage/usage/preprocessing.md
@@ -90,6 +90,8 @@ it is recommended that they are further processed in order to ensure optimal Ret
The `PreProcessor` takes one of the documents created by the converter as input,
performs various cleaning steps and splits them into multiple smaller documents.

+For suggestions on how best to split your documents, see [Optimization](/docs/latest/optimizationmd).
+
```python
doc = converter.convert(file_path=file, meta=None)
processor = PreProcessor(clean_empty_lines=True,
@@ -107,29 +109,3 @@ docs = processor.process(d)
* `split_by` determines what unit the document is split by: `'word'`, `'sentence'` or `'passage'`
* `split_length` sets a maximum number of `'word'`, `'sentence'` or `'passage'` units per output document
* `split_respect_sentence_boundary` ensures that document boundaries do not fall in the middle of sentences

## Impact of Document Splitting

The File Converters will treat each file as a single document regardless of length.
This is not always ideal as long documents can have negative impacts on both speed and accuracy.

Document length has a very direct impact on the speed of the Reader.
**If you halve the length of your documents, you can expect that the Reader will double in speed.**

It is generally not a good idea to let document boundaries fall in the middle of sentences.
Doing so means that each document will contain incomplete sentence fragments
which maybe be hard for both retriever and reader to interpret.

For **sparse retrievers**, very long documents pose a challenge since the signal of the relevant section of text
can get washed out by the rest of the document.
We would recommend making sure that **documents are no longer than 10,000 words**.

**Dense retrievers** are limited in the length of text that they can read in one pass.
As such, it is important that documents are not longer than the dense retriever's maximum input length.
By default, Haystack's DensePassageRetriever model has a maximum length of 256 tokens.
As such, we recommend that documents contain significantly less words.
We have found decent performance with **documents around 100 words long**.




diff --git a/docs/_src/usage/usage/retriever.md b/docs/_src/usage/usage/retriever.md
index 363095e0a..b4bf42cff 100644
--- a/docs/_src/usage/usage/retriever.md
+++ b/docs/_src/usage/usage/retriever.md
@@ -32,6 +32,8 @@ Here are the combinations which are supported:
| Embedding | Y | Y | N | Y |
| DPR | Y | Y | N | Y |

+See [Optimization](/docs/latest/optimizationmd) for suggestions on how to choose top-k values.
+
## TF-IDF

### Description

diff --git a/docs/_src/usage/usage/terms.md b/docs/_src/usage/usage/terms.md
index cb98a53c4..d9b7be75d 100644
--- a/docs/_src/usage/usage/terms.md
+++ b/docs/_src/usage/usage/terms.md
@@ -44,6 +44,7 @@ In question answering models (and hence in Haystack Readers), this is usually a
**Question Answering (QA)** - A popular task in the world of NLP where systems have to find answers to questions.
The term is generally used to refer to extractive question answering, where a system has to find the minimal text span in a given document that contains the answer to the question.
+Note, however, that it may also refer to abstractive question answering or FAQ matching.

**Reader** - The component in Haystack that does the closest reading of a document to extract the exact text which answers a question.

diff --git a/docs/_src/usage/usage/use_cases.md b/docs/_src/usage/usage/use_cases.md
new file mode 100644
index 000000000..49311328c
--- /dev/null
+++ b/docs/_src/usage/usage/use_cases.md
@@ -0,0 +1,69 @@



# Use cases

## Semantic Search System

Take the leap from using keyword search on your own documents to semantic search with Haystack.


* Store your documents in the database of your choice (Elasticsearch, SQL, in memory, FAISS)


* Perform question-driven queries.

Expect to see results that highlight the very sentence that contains the answer to your question.
Thanks to the power of Transformer-based language models, results are chosen based on compatibility in meaning
rather than lexical overlap.



![image](../../img/search.png)

## Information Extractor

Automate the extraction of relevant information from a set of documents that pertain to the same topics but for different entities.

Haystack can:


* Apply a set of standard questions to each document in a store


* Return a NO_ANSWER if a given document does not contain the answer to a question

Say you have the financial reports for different companies over different years.
You can gather a set of standard questions which are applicable to each financial report,
like *what is the revenue forecast for 2020?* or *what are the main sources of income?*.
Haystack will try to find an answer for each question within each document!

We’ve seen this style of application be particularly effective in the sphere of finance and patent law
but we see a lot of potential in using this to gain a better overview of academic papers and internal business documents.


## FAQ Style Question Answering

Leverage existing FAQ documents and semantic similarity search to answer new incoming questions.
The workflow is as follows:


* Store a set of FAQ documents in Haystack


* The user presents a new question


* Haystack will find the closest match to the new question in the FAQ documents


* The user will be presented with the most similar Question Answer pair

Haystack’s flexibility allows you to give new users more dynamic access to your existing documentation.

diff --git a/tutorials/Tutorial7_RAG_Generator.ipynb b/tutorials/Tutorial7_RAG_Generator.ipynb
index 80a92c757..67167d26a 100644
--- a/tutorials/Tutorial7_RAG_Generator.ipynb
+++ b/tutorials/Tutorial7_RAG_Generator.ipynb
@@ -1,23 +1,49 @@
{
 "cells": [
  {
-  "cell_type": "code",
+  "cell_type": "markdown",
+  "source": [
+   "# Generative QA with \"Retrieval-Augmented Generation\"\n",
+   "\n",
+   "While extractive QA highlights the span of text that answers a query,\n",
+   "generative QA can return a novel text answer that it has composed.\n",
+   "In this tutorial, you will learn how to set up a generative system using the\n",
+   "[RAG model](https://arxiv.org/abs/2005.11401), which conditions the\n",
+   "answer generator on a set of retrieved documents."
+ ], "metadata": { - "id": "iDyfhfyp7Sjh" - }, + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Here are the packages and imports that we'll need:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], "source": [ "!pip install git+https://github.com/deepset-ai/haystack.git\n", "!pip install urllib3==1.25.4\n", "!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html\n" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } }, { "cell_type": "code", - "metadata": { - "id": "ICZanGLa7khF" - }, + "execution_count": null, + "outputs": [], "source": [ "from typing import List\n", "import requests\n", @@ -27,32 +53,63 @@ "from haystack.generator.transformers import RAGenerator\n", "from haystack.retriever.dense import DensePassageRetriever" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's download a csv containing some sample text and preprocess the data.\n" + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "D3f-CQ4c7lEN" - }, + "execution_count": null, + "outputs": [], "source": [ - "# Add documents from which you want generate answers\n", - "# Download a csv containing some sample documents data\n", - "# Here some sample documents data\n", + "# Download sample\n", "temp = requests.get(\"https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv\")\n", "open('small_generator_dataset.csv', 'wb').write(temp.content)\n", "\n", - "# Get dataframe with columns \"title\", and \"text\"\n", + "# Create dataframe with columns \"title\" and \"text\"\n", "df = pd.read_csv(\"small_generator_dataset.csv\", sep=',')\n", "# Minimal cleaning\n", "df.fillna(value=\"\", inplace=True)\n", "\n", - "print(df.head())\n", - "\n", - "# Create to haystack document format\n", + "print(df.head())" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We can cast our data into Haystack Document objects.\n", + "Alternatively, we can also just use dictionaries with \"text\" and \"meta\" fields" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Use data to initialize Document objects\n", "titles = list(df[\"title\"].values)\n", "texts = list(df[\"text\"].values)\n", - "\n", "documents: List[Document] = []\n", "for title, text in zip(titles, texts):\n", " documents.append(\n", @@ -64,16 +121,29 @@ " )\n", " )" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator.\n", + "FAISS is chosen here since it is optimized vector storage." 
+ ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "upRu3ebX7nr_" - }, + "execution_count": null, + "outputs": [], "source": [ - "# Initialize FAISS document store to documents and corresponding index for embeddings\n", + "# Initialize FAISS document store.\n", "# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding\n", "document_store = FAISSDocumentStore(\n", " faiss_index_factory_str=\"Flat\",\n", @@ -100,37 +170,60 @@ " num_beams=2,\n", ")" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "We write documents to the DocumentStore, first by deleting any remaining documents then calling `write_documents()`.\n", + "The `update_embeddings()` method uses the retriever to create an embedding for each document.\n" + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "as8j7hkW7rOW" - }, + "execution_count": null, + "outputs": [], "source": [ "# Delete existing documents in documents store\n", "document_store.delete_all_documents()\n", + "\n", "# Write documents to document store\n", "document_store.write_documents(documents)\n", + "\n", "# Add documents embeddings to index\n", "document_store.update_embeddings(\n", " retriever=retriever\n", ")" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Here are our questions:" + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "j8It45R872vb", - "cellView": "form" - }, + "execution_count": null, + "outputs": [], "source": [ - "#@title\n", - "# Now ask your questions\n", - "# We have some sample questions\n", "QUESTIONS = [\n", " \"who got the first nobel prize in physics\",\n", " \"when is the next deadpool movie being released\",\n", @@ -149,16 +242,31 @@ " \"what is the name of manchester united stadium\",\n", "]" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Now let's run our system!\n", + "The retriever will pick out a small subset of documents that it finds relevant.\n", + "These are used to condition the generator as it generates the answer.\n", + "What it should return then are novel text spans that form and answer to your question!" 
+ ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "metadata": { - "id": "xPUHRuTP742h" - }, + "execution_count": null, + "outputs": [], "source": [ - "# Now generate answer for question\n", + "# Now generate an answer for each question\n", "for question in QUESTIONS:\n", " # Retrieve related documents from retriever\n", " retriever_results = retriever.retrieve(\n", @@ -176,19 +284,31 @@ " answers = predicted_result[\"answers\"]\n", " print(f'Generated answer is \\'{answers[0][\"answer\"]}\\' for the question = \\'{question}\\'')" ], - "execution_count": null, - "outputs": [] + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } } ], "metadata": { - "colab": { - "name": "Tutorial7_RAG_Generator.ipynb", - "provenance": [], - "collapsed_sections": [] - }, "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" } }, "nbformat": 4,