# haystack/tutorials/Tutorial5_Evaluation.py

from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.indexing.utils import fetch_archive_from_http
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.retriever.dense import DensePassageRetriever
from haystack.reader.farm import FARMReader
from haystack.finder import Finder
from farm.utils import initialize_device_settings

import logging
import subprocess
import time

# Module-level logger for this tutorial script
logger = logging.getLogger(__name__)

##############################################
# Settings
##############################################
# Set to False to connect to an Elasticsearch instance that is already running
# instead of launching a new one via Docker.
LAUNCH_ELASTICSEARCH = True

# Switches selecting which of the evaluation sections further down are executed.
# Each flag is checked by its own `if`, so several can be enabled at once.
eval_retriever_only = True
eval_reader_only = False
eval_both = False

# make sure these indices do not collide with existing ones, the indices will be wiped clean before data is inserted
doc_index = "tutorial5_docs"
label_index = "tutorial5_labels"
##############################################
# Code
##############################################
# Resolve the device settings with CUDA requested; `device` is handed to reader.eval() further down.
device, n_gpu = initialize_device_settings(use_cuda=True)
# Start an Elasticsearch server
# You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in
# your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.
if LAUNCH_ELASTICSEARCH:
    logger.info("Starting Elasticsearch ...")
    # With shell=True the command has to be passed as one string: a list is only tolerated on POSIX
    # (where the first item is taken as the command line) and is mangled by argument quoting on Windows.
    status = subprocess.run(
        'docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2', shell=True
    )
    # A non-zero exit code means the container could not be started
    if status.returncode:
        raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance "
                        "then set LAUNCH_ELASTICSEARCH in the script to False.")
    # Fixed pause; presumably gives the freshly started container time to come up before we connect to it
    time.sleep(30)

# Fetch the evaluation data: a subset of the Natural Questions development set containing 50 documents.
# The archive behind s3_url is downloaded into doc_dir.
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
doc_dir = "../data/nq"
fetch_archive_from_http(output_dir=doc_dir, url=s3_url)

# Connect to the Elasticsearch instance running on localhost
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
    create_index=False,
    embedding_field="emb",
    embedding_dim=768,
    excluded_meta_data=["emb"],
)


# Add evaluation data to the Elasticsearch database.
# Both tutorial indices are emptied first so that re-running the script does not leave duplicate elements.
for index_name in (doc_index, label_index):
    document_store.delete_all_documents(index=index_name)
document_store.add_eval_data(
    filename="../data/nq/nq_dev_subset_v2.json",
    doc_index=doc_index,
    label_index=label_index,
)

# Initialize Retriever
# Sparse retriever (from haystack.retriever.sparse) working on the Elasticsearch document store
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: Evaluate DensePassageRetriever
# Note that DPR works best when you index short passages < 512 tokens as only those tokens will be used for the embedding.
# Here, for nq_dev_subset_v2.json we have avg. num of tokens = 5220(!).
# DPR still outperforms Elastic's BM25 by a small margin here.
# To try it, uncomment the lines below; they replace the `retriever` defined above.
# retriever = DensePassageRetriever(document_store=document_store,
#                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#                                   use_gpu=True,
#                                   embed_title=True,
#                                   remove_sep_tok_from_untitled_passages=True)
# document_store.update_embeddings(retriever, index=doc_index)


# Initialize Reader
# NOTE(review): top_k_per_candidate presumably limits the answers extracted per retrieved document —
# confirm against the FARMReader documentation.
reader = FARMReader("deepset/roberta-base-squad2", top_k_per_candidate=4)

# Initialize Finder which sticks together Reader and Retriever
finder = Finder(reader, retriever)


# Evaluate the Retriever on its own
if eval_retriever_only:
    retriever_eval_results = retriever.eval(
        top_k=10,
        label_index=label_index,
        doc_index=doc_index,
    )
    # Recall: proportion of questions for which the correct document containing the answer is
    # among the retrieved documents (top_k=10 is requested above)
    print("Retriever Recall:", retriever_eval_results["recall"])
    # Mean Avg Precision: rewards retrievers that give relevant documents a higher rank
    print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

# Evaluate the Reader on its own
if eval_reader_only:
    reader_eval_results = reader.eval(
        document_store=document_store,
        device=device,
        label_index=label_index,
        doc_index=doc_index,
    )
    # The Reader can alternatively be evaluated directly on a SQuAD-formatted file, without passing the data
    # to Elasticsearch:
    #reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)

    # Top-N-Accuracy: proportion of predicted answers that match with their corresponding correct answer
    print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
    # Exact Match: proportion of questions where the predicted answer is exactly the same as the correct answer
    print("Reader Exact Match:", reader_eval_results["EM"])
    # F1-Score: average overlap between the predicted answers and the correct answers
    print("Reader F1-Score:", reader_eval_results["f1"])


# Evaluate Reader and Retriever together through the Finder and print the resulting metrics
if eval_both:
    finder_eval_results = finder.eval(
        top_k_retriever=1,
        top_k_reader=10,
        label_index=label_index,
        doc_index=doc_index,
    )
    finder.print_eval_results(finder_eval_results)
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00			`from haystack.database.elasticsearch import ElasticsearchDocumentStore`
Add PDF text extraction (#109) 2020-06-08 11:07:19 +02:00			`from haystack.indexing.utils import fetch_archive_from_http`
Dense Passage Retriever (Inference) (#167) 2020-06-30 19:05:45 +02:00			`from haystack.retriever.sparse import ElasticsearchRetriever`
Refactor DPR from FB to Transformers codebase (#308) * change_HFBertEncoder to transformers DPREncoder * Removed BertTensorizer * model download relative path * Refactor model load * Tutorial5 DPR updated * fix print_eval_results typo * copy transformers DPR modules in dpr_utils and test * transformer v3.0.2 import errors fixed * remove dependency of DPRConfig on attribute use_return_tuple * Adjust transformers 302 locally to work with dpr * projection layer removed from DPR encoders * fixed mypy errors * transformers DPR compatible code added * transformers DPR compatibility added * bug fix in tutorial 6 notebook * Docstring update and variable naming issues fix * tutorial modified to reflect DPR variable naming change * title addition to passage use-cases handled * modified handling untitled batch * resolved mypy errors * typos in docstrings and comments fixed * cleaned DPR code and added new test cases * warnings added for non-bert model [SEP] token removal * changed warning to logger warning * title mask creation refactored * bug fix on cuda issues * tutorial 6 instantiates modified DPR * tutorial 5 modified * tutorial 5 ipython notebook modified: DPR instantiation * batch_size added to DPR instantiation * tutorial 5 jupyter notebook typos fixed * improved docstrings, fixed typos * Update docstring Co-authored-by: Timo Moeller <timo.moeller@deepset.ai> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> 2020-08-25 20:16:00 +05:30			`from haystack.retriever.dense import DensePassageRetriever`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00			`from haystack.reader.farm import FARMReader`
			`from haystack.finder import Finder`
			`from farm.utils import initialize_device_settings`

			`import logging`
			`import subprocess`
			`import time`

Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`logger = logging.getLogger(__name__)`

			`##############################################`
			`# Settings`
			`##############################################`
			`LAUNCH_ELASTICSEARCH = True`

Add eval for Dense Passage Retriever & Refactor handling of labels/feedback (#243) 2020-07-31 11:34:06 +02:00			`eval_retriever_only = True`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`eval_reader_only = False`
Add eval for Dense Passage Retriever & Refactor handling of labels/feedback (#243) 2020-07-31 11:34:06 +02:00			`eval_both = False`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00
Aggregate label objects for same questions (#292) * Add aggregate labels obj, use in retriever eval function * Change launch ES param * Move aggregation from ES document store to base class * Fix type annotations 2020-08-07 11:24:41 +02:00			`# make sure these indices do not collide with existing ones, the indices will be wiped clean before data is inserted`
			`doc_index = "tutorial5_docs"`
			`label_index = "tutorial5_labels"`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`##############################################`
			`# Code`
			`##############################################`
			`device, n_gpu = initialize_device_settings(use_cuda=True)`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00			`# Start an Elasticsearch server`
			`# You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in`
			`# your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.`
			`if LAUNCH_ELASTICSEARCH:`
			`logging.info("Starting Elasticsearch ...")`
			`status = subprocess.run(`
			`['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True`
			`)`
			`if status.returncode:`
			`raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"`
			`"then set LAUNCH_ELASTICSEARCH in the script to False.")`
			`time.sleep(30)`

			`# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents`
			`doc_dir = "../data/nq"`
update eval dataset 2020-07-15 16:14:52 +02:00			`s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00			`fetch_archive_from_http(url=s3_url, output_dir=doc_dir)`

			`# Connect to Elasticsearch`
Add eval for Dense Passage Retriever & Refactor handling of labels/feedback (#243) 2020-07-31 11:34:06 +02:00			`document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",`
			`create_index=False, embedding_field="emb",`
			`embedding_dim=768, excluded_meta_data=["emb"])`

Aggregate label objects for same questions (#292) * Add aggregate labels obj, use in retriever eval function * Change launch ES param * Move aggregation from ES document store to base class * Fix type annotations 2020-08-07 11:24:41 +02:00
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00			`# Add evaluation data to Elasticsearch database`
Aggregate label objects for same questions (#292) * Add aggregate labels obj, use in retriever eval function * Change launch ES param * Move aggregation from ES document store to base class * Fix type annotations 2020-08-07 11:24:41 +02:00			`# We first delete the custom tutorial indices to not have duplicate elements`
			`document_store.delete_all_documents(index=doc_index)`
			`document_store.delete_all_documents(index=label_index)`
			`document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json", doc_index=doc_index, label_index=label_index)`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00
			`# Initialize Retriever`
			`retriever = ElasticsearchRetriever(document_store=document_store)`

Add eval for Dense Passage Retriever & Refactor handling of labels/feedback (#243) 2020-07-31 11:34:06 +02:00			`# Alternative: Evaluate DensePassageRetriever`
			`# Note, that DPR works best when you index short passages < 512 tokens as only those tokens will be used for the embedding.`
			`# Here, for nq_dev_subset_v2.json we have avg. num of tokens = 5220(!).`
			`# DPR still outperforms Elastic's BM25 by a small margin here.`
Refactor DPR from FB to Transformers codebase (#308) * change_HFBertEncoder to transformers DPREncoder * Removed BertTensorizer * model download relative path * Refactor model load * Tutorial5 DPR updated * fix print_eval_results typo * copy transformers DPR modules in dpr_utils and test * transformer v3.0.2 import errors fixed * remove dependency of DPRConfig on attribute use_return_tuple * Adjust transformers 302 locally to work with dpr * projection layer removed from DPR encoders * fixed mypy errors * transformers DPR compatible code added * transformers DPR compatibility added * bug fix in tutorial 6 notebook * Docstring update and variable naming issues fix * tutorial modified to reflect DPR variable naming change * title addition to passage use-cases handled * modified handling untitled batch * resolved mypy errors * typos in docstrings and comments fixed * cleaned DPR code and added new test cases * warnings added for non-bert model [SEP] token removal * changed warning to logger warning * title mask creation refactored * bug fix on cuda issues * tutorial 6 instantiates modified DPR * tutorial 5 modified * tutorial 5 ipython notebook modified: DPR instantiation * batch_size added to DPR instantiation * tutorial 5 jupyter notebook typos fixed * improved docstrings, fixed typos * Update docstring Co-authored-by: Timo Moeller <timo.moeller@deepset.ai> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> 2020-08-25 20:16:00 +05:30			`# retriever = DensePassageRetriever(document_store=document_store,`
			`# query_embedding_model="facebook/dpr-question_encoder-single-nq-base",`
			`# passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",`
			`# use_gpu=True,`
			`# embed_title=True,`
			`# remove_sep_tok_from_untitled_passages=True)`
			`# document_store.update_embeddings(retriever, index=doc_index)`
Add eval for Dense Passage Retriever & Refactor handling of labels/feedback (#243) 2020-07-31 11:34:06 +02:00

Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00			`# Initialize Reader`
Set top_k_per_candidate 2020-08-26 12:03:56 +02:00			`reader = FARMReader("deepset/roberta-base-squad2", top_k_per_candidate=4)`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00
			`# Initialize Finder which sticks together Reader and Retriever`
			`finder = Finder(reader, retriever)`

Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00
			`## Evaluate Retriever on its own`
			`if eval_retriever_only:`
Change to retriever eval top_k to match notebook 2020-08-18 11:39:49 +02:00			`retriever_eval_results = retriever.eval(top_k=10, label_index=label_index, doc_index=doc_index)`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`## Retriever Recall is the proportion of questions for which the correct document containing the answer is`
			`## among the correct documents`
			`print("Retriever Recall:", retriever_eval_results["recall"])`
			`## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank`
			`print("Retriever Mean Avg Precision:", retriever_eval_results["map"])`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00
			`# Evaluate Reader on its own`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`if eval_reader_only:`
Aggregate label objects for same questions (#292) * Add aggregate labels obj, use in retriever eval function * Change launch ES param * Move aggregation from ES document store to base class * Fix type annotations 2020-08-07 11:24:41 +02:00			`reader_eval_results = reader.eval(document_store=document_store, device=device, label_index=label_index, doc_index=doc_index)`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch`
One more update 2020-07-15 16:24:10 +02:00			`#reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00
Upgrade to new FARM / Transformers / PyTorch versions (#212) 2020-07-14 18:53:15 +02:00			`## Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer`
			`print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer`
			`print("Reader Exact Match:", reader_eval_results["EM"])`
			`## Reader F1-Score is the average overlap between the predicted answers and the correct answers`
			`print("Reader F1-Score:", reader_eval_results["f1"])`
Add Evaluation of Reader, Retriever and Finder (#92) 2020-05-29 15:57:07 +02:00

			`# Evaluate combination of Reader and Retriever through Finder`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`if eval_both:`
Aggregate label objects for same questions (#292) * Add aggregate labels obj, use in retriever eval function * Change launch ES param * Move aggregation from ES document store to base class * Fix type annotations 2020-08-07 11:24:41 +02:00			`finder_eval_results = finder.eval(top_k_retriever=1, top_k_reader=10, label_index=label_index, doc_index=doc_index)`
Fix document id missing in farm inference output (#174) 2020-06-26 11:01:10 +02:00			`finder.print_eval_results(finder_eval_results)`