from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import ElasticsearchRetriever, DensePassageRetriever, EmbeddingRetriever, FARMReader, PreProcessor
from haystack.utils import fetch_archive_from_http, launch_es
from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline
from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span

import logging

logger = logging.getLogger(__name__)

def tutorial5_evaluation():
# Make sure these indices do not collide with existing ones; they will be wiped clean before data is inserted
doc_index = "tutorial5_docs"
label_index = "tutorial5_labels"

##############################################
# Code
##############################################
launch_es()

# Download evaluation data, which is a subset of the Natural Questions development set containing 50 documents with one question per document and multiple annotated answers
doc_dir = "data/tutorial5"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Connect to Elasticsearch
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index=doc_index,
    label_index=label_index,
    embedding_field="emb",
    embedding_dim=768,
    excluded_meta_data=["emb"],
)

# Add evaluation data to Elasticsearch document store
# We first delete the custom tutorial indices to not have duplicate elements
# and also split our documents into shorter passages using the PreProcessor
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=0,
    split_respect_sentence_boundary=False,
    clean_empty_lines=False,
    clean_whitespace=False,
)
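
# To get a feel for what the PreProcessor does, you can run it on a toy document before indexing the real data.
# This is only an optional illustration: it assumes that process() accepts a list of Document objects and
# returns the resulting passages (check the PreProcessor API of your Haystack version before uncommenting).
# example_passages = preprocessor.process([Document(content="Some long text " * 300)])
# print(f"Split 1 document into {len(example_passages)} passages")
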
document_store.delete_documents(index=doc_index)
document_store.delete_documents(index=label_index)

# The add_eval_data() method converts the given dataset in json format into Haystack document and label objects.
# Those objects are then indexed in their respective document and label index in the document store.
# The method can be used with any dataset in SQuAD format.
document_store.add_eval_data(
    filename=doc_dir + "/nq_dev_subset_v2.json",
    doc_index=doc_index,
    label_index=label_index,
    preprocessor=preprocessor,
)
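
# Optional sanity check: verify how many documents and labels ended up in the two indices.
# get_document_count() and get_label_count() are assumed here to be available on the document store
# (they are part of the Haystack 1.x DocumentStore API); adjust if your version differs.
print("Documents indexed:", document_store.get_document_count(index=doc_index))
print("Labels indexed:", document_store.get_label_count(index=label_index))
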
# Initialize Retriever
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever)
# The EmbeddingRetriever uses a single transformer-based encoder model for both query and document.
# In contrast, DensePassageRetriever uses two separate encoders, one for the query and one for the passage.
# Please make sure the "embedding_dim" parameter in the DocumentStore above matches the output dimension of your models!
# Please also make sure the PreProcessor splits your files into chunks that fit within
# the max_seq_len limitations of the Transformer models.
# The SentenceTransformer model "sentence-transformers/multi-qa-mpnet-base-dot-v1" generally works well with the EmbeddingRetriever on any kind of English text.
# For more information and suggestions on different models check out the documentation at: https://www.sbert.net/docs/pretrained_models.html
# from haystack.nodes import EmbeddingRetriever, DensePassageRetriever
# retriever = EmbeddingRetriever(document_store=document_store, model_format="sentence_transformers",
#                                embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
# retriever = DensePassageRetriever(document_store=document_store,
#                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#                                   use_gpu=True,
#                                   max_seq_len_passage=256,
#                                   embed_title=True)
# document_store.update_embeddings(retriever, index=doc_index)

# Initialize Reader
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", top_k=4, return_no_answer=True)

# Define a pipeline consisting of the initialized retriever and reader.
# Here we evaluate retriever and reader in an integrated (a.k.a. open domain) fashion on the full corpus of documents,
# i.e. a document is considered correctly retrieved if it contains the gold answer string within it.
# The reader is evaluated based purely on the predicted answer string, regardless of which document this came from
# and the position of the extracted span.
# The generation of predictions is separated from the calculation of metrics.
# This allows you to run the computation-heavy model predictions only once and then iterate flexibly on the metrics or reports you want to generate.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# The evaluation also works with any other pipeline.
# For example you could use a DocumentSearchPipeline as an alternative:
# pipeline = DocumentSearchPipeline(retriever=retriever)
# We can load evaluation labels from the document store
# We are also opting to filter out no_answer samples
eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=False)
eval_labels = [label for label in eval_labels if not label.no_answer]

## Alternative: Define queries and labels directly
# eval_labels = [
#     MultiLabel(
#         labels=[
#             Label(
#                 query="who is written in the book of life",
#                 answer=Answer(
#                     answer="every person who is destined for Heaven or the World to Come",
#                     offsets_in_context=[Span(374, 434)]
#                 ),
#                 document=Document(
#                     id='1b090aec7dbd1af6739c4c80f8995877-0',
#                     content_type="text",
#                     content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is '
#                             'about the book mentioned in Christian and Jewish religious teachings...'
#                 ),
#                 is_correct_answer=True,
#                 is_correct_document=True,
#                 origin="gold-label"
#             )
#         ]
#     )
# ]

# Similar to pipeline.run() we can execute pipeline.eval()
eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})

# The EvaluationResult contains a pandas dataframe for each pipeline node.
# That's why there are two dataframes in the EvaluationResult of an ExtractiveQAPipeline.
retriever_result = eval_result["Retriever"]
retriever_result.head()

reader_result = eval_result["Reader"]
reader_result.head()
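
# To get an overview of which fields are available for further analysis, we can simply list the columns of
# each dataframe; the exact set of columns may vary between Haystack versions.
print("Retriever dataframe columns:", retriever_result.columns.tolist())
print("Reader dataframe columns:", reader_result.columns.tolist())
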
# We can filter for all documents retrieved for a given query
query = "who is written in the book of life"
retriever_book_of_life = retriever_result[retriever_result["query"] == query]

# We can also filter for all answers predicted for a given query
reader_book_of_life = reader_result[reader_result["query"] == query]

# Save the evaluation result so that we can reload it later
# and calculate evaluation metrics without running the pipeline again.
eval_result.save("../")

## Calculating Evaluation Metrics
# Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions,
# such as F1-score of each individual prediction of the Reader node or recall of the retriever.
# To learn more about the metrics, see [Evaluation Metrics](https://haystack.deepset.ai/guides/evaluation#metrics-retrieval)
saved_eval_result = EvaluationResult.load("../")
metrics = saved_eval_result.calculate_metrics()

print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}')
print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}')
print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}')
print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}')
print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}')
print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}')
print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}')
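
# Because the predictions themselves are stored in the EvaluationResult, metrics can be recalculated with
# different settings without rerunning the pipeline. The simulated_top_k_reader parameter used below is
# assumed from the Haystack 1.x calculate_metrics() API; check the docs of your version before relying on it.
simulated_metrics = saved_eval_result.calculate_metrics(simulated_top_k_reader=1)
print(f'Reader - F1-Score (considering only the top answer): {simulated_metrics["Reader"]["f1"]}')
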
## Generating an Evaluation Report
# A summary of the evaluation results can be printed to get a quick overview.
# It includes some aggregated metrics and also shows a few wrongly predicted examples.
pipeline.print_eval_report(saved_eval_result)

## Advanced Evaluation Metrics
# Semantic Answer Similarity (SAS) is an advanced evaluation metric that can be calculated in Haystack.
# This metric takes into account whether the meaning of a predicted answer is similar to the annotated gold answer
# rather than just doing string comparison. To this end, SAS relies on pre-trained models.
# For English, we recommend "cross-encoder/stsb-roberta-large", whereas for German we recommend "deepset/gbert-large-sts".
# A good multilingual model is "sentence-transformers/paraphrase-multilingual-mpnet-base-v2".
# More info on this metric can be found in our [paper](https://arxiv.org/abs/2108.06130)
# or in our [blog post](https://www.deepset.ai/blog/semantic-answer-similarity-to-evaluate-qa).
advanced_eval_result = pipeline.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 1}},
    sas_model_name_or_path="cross-encoder/stsb-roberta-large",
)

metrics = advanced_eval_result.calculate_metrics()
print(metrics["Reader"]["sas"])
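
# Since SAS is computed alongside the standard metrics on the same run, it can be compared directly with
# string-based metrics such as Exact Match and F1 from the same metrics dict.
print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}')
print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}')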

## Isolated Evaluation Mode
# The isolated node evaluation uses labels as input to the Reader node instead of the output of the preceding Retriever node.
# Thereby, we can additionally calculate the upper bounds of the evaluation metrics of the Reader.
# Note that even with isolated evaluation enabled, integrated evaluation will still be running.
eval_result_with_upper_bounds = pipeline.eval(
    labels=eval_labels, params={"Retriever": {"top_k": 5}}, add_isolated_node_eval=True
)
pipeline.print_eval_report(eval_result_with_upper_bounds)
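
# With add_isolated_node_eval=True the result contains both integrated and isolated Reader predictions.
# The eval_mode parameter below is assumed from the Haystack 1.x calculate_metrics() API; it reads out the
# isolated numbers, i.e. the upper bound of the Reader's performance given perfect retrieval.
isolated_metrics = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated")
print(f'Reader - F1-Score (upper bound): {isolated_metrics["Reader"]["f1"]}')
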
## Evaluation of Individual Components
# Sometimes you might want to evaluate individual components,
# for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself.

# Evaluate Retriever on its own
# Here we evaluate only the retriever, based on whether the gold_label document is retrieved.
# Note that no_answer samples are omitted when evaluation is performed with this method.
retriever_eval_results = retriever.eval(top_k=5, label_index=label_index, doc_index=doc_index)

## Retriever Recall is the proportion of questions for which the correct document containing the answer is
## among the retrieved documents
print("Retriever Recall:", retriever_eval_results["recall"])

## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

# Just as a sanity check, we can compare the recall from `retriever.eval()`
# with the multi hit recall from `pipeline.eval(add_isolated_node_eval=True)`.
# These two recall metrics are only comparable since we chose to filter out no_answer samples when generating eval_labels.
metrics = eval_result_with_upper_bounds.calculate_metrics()
print(metrics["Retriever"]["recall_multi_hit"])

# Evaluate Reader on its own
# Here we evaluate only the reader in a closed domain fashion, i.e. the reader is given one query
# and its corresponding relevant document, and metrics are calculated based on whether the right position in this text
# is selected by the model as the answer span (i.e. SQuAD style)
reader_eval_results = reader.eval(document_store=document_store, label_index=label_index, doc_index=doc_index)

# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
# reader_eval_results = reader.eval_on_file(doc_dir, "nq_dev_subset_v2.json")

## Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer
print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])

## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
print("Reader Exact Match:", reader_eval_results["EM"])

## Reader F1-Score is the average overlap between the predicted answers and the correct answers
print("Reader F1-Score:", reader_eval_results["f1"])

if __name__ == "__main__":
    tutorial5_evaluation()

# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/