# haystack/haystack/finder.py

import logging
import time
from copy import deepcopy
from statistics import mean
from typing import Optional, Dict, Any, List
from collections import defaultdict
from haystack.reader.base import BaseReader
from haystack.retriever.base import BaseRetriever
from haystack import MultiLabel
from haystack.eval import calculate_average_precision_and_reciprocal_rank, eval_counts_reader_batch, \
calculate_reader_metrics, eval_counts_reader
logger = logging.getLogger(__name__)
class Finder:
"""
Finder ties together instances of the Reader and Retriever class.
It provides an interface to predict top n answers for a given question.
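
    Example (illustrative sketch; ``my_reader`` and ``my_retriever`` stand for Reader/Retriever
    instances that are assumed to be initialised elsewhere):

        finder = Finder(reader=my_reader, retriever=my_retriever)
        prediction = finder.get_answers(question="Who is the father of Arya Stark?", top_k_reader=3)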
"""
def __init__(self, reader: Optional[BaseReader], retriever: Optional[BaseRetriever]):
"""
Initialize a Finder instance.
:param reader: Reader instance
:param retriever: Retriever instance
"""
logger.warning(
"""DEPRECATION WARNINGS:
1. The 'Finder' class will be deprecated in the next Haystack release in
favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
including Retriever, Readers, and Generators.
For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
2. The `question` parameter in search requests & results is renamed to `query`."""
)
self.retriever = retriever
self.reader = reader
if self.reader is None and self.retriever is None:
raise AttributeError("Finder: self.reader and self.retriever can not be both None")
    def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int = 10, filters: Optional[dict] = None, index: Optional[str] = None):
"""
Get top k answers for a given question.
:param question: The question string
:param top_k_reader: Number of answers returned by the reader
:param top_k_retriever: Number of text units to be retrieved
:param filters: Limit scope to documents having the given meta data values.
            The format for the dict is ``{"key-1": ["value-1", "value-2"], "key-2": ["value-3"], ...}``.
:param index: Index to retrieve documents from
        :return: Dict containing the question and the top k answers from the reader (under the key ``"answers"``)
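
        Example (sketch; ``finder`` is assumed to be a Finder built with an initialised reader and
        retriever, and the filter values are purely illustrative):

            prediction = finder.get_answers(question="Who is the father of Arya Stark?",
                                            top_k_retriever=10, top_k_reader=3,
                                            filters={"name": ["wiki-article-1"]})
            for answer in prediction["answers"]:
                print(answer["answer"], answer["probability"], answer["meta"])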
"""
logger.warning(
"""DEPRECATION WARNINGS:
1. The 'Finder' class will be deprecated in the next Haystack release in
favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
including Retriever, Readers, and Generators.
For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
2. The `question` parameter in search requests & results is renamed to `query`."""
)
if self.retriever is None or self.reader is None:
raise AttributeError("Finder.get_answers requires self.retriever AND self.reader")
        # 1) Apply retriever (with optional filters) to get fast candidate documents
documents = self.retriever.retrieve(question, filters=filters, top_k=top_k_retriever, index=index)
logger.info(f"Got {len(documents)} candidates from retriever")
logger.debug(f"Retrieved document IDs: {[doc.id for doc in documents]}")
if len(documents) == 0:
logger.info("Retriever did not return any documents. Skipping reader ...")
empty_result = {"question": question, "answers": []}
return empty_result
# 2) Apply reader to get granular answer(s)
len_chars = sum([len(d.text) for d in documents])
logger.info(f"Reader is looking for detailed answer in {len_chars} chars ...")
results = self.reader.predict(query=question,
documents=documents,
top_k=top_k_reader) # type: Dict[str, Any]
results["question"] = results["query"]
# Add corresponding document_name and more meta data, if an answer contains the document_id
for ans in results["answers"]:
ans["meta"] = {}
for doc in documents:
if doc.id == ans["document_id"]:
ans["meta"] = deepcopy(doc.meta)
return results
    def get_answers_via_similar_questions(self, question: str, top_k_retriever: int = 10, filters: Optional[dict] = None, index: Optional[str] = None):
"""
Get top k answers for a given question using only a retriever.
:param question: The question string
:param top_k_retriever: Number of text units to be retrieved
:param filters: Limit scope to documents having the given meta data values.
            The format for the dict is ``{"key-1": ["value-1", "value-2"], "key-2": ["value-3"], ...}``.
:param index: Index to retrieve documents from
        :return: Dict containing the question and the answers built from the retrieved FAQ-style documents (under the key ``"answers"``)
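
        Example (sketch; assumes a document store that was populated in FAQ style, i.e. each document
        stores an existing question in ``doc.question`` and the matching answer in ``doc.text``;
        ``my_embedding_retriever`` is an assumed retriever instance):

            faq_finder = Finder(reader=None, retriever=my_embedding_retriever)
            results = faq_finder.get_answers_via_similar_questions(question="How do I reset my password?",
                                                                   top_k_retriever=3)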
"""
if self.retriever is None:
raise AttributeError("Finder.get_answers_via_similar_questions requires self.retriever")
results = {"question": question, "answers": []} # type: Dict[str, Any]
# 1) Apply retriever to match similar questions via cosine similarity of embeddings
documents = self.retriever.retrieve(question, top_k=top_k_retriever, filters=filters, index=index)
# 2) Format response
for doc in documents:
            # TODO: proper calibration of pseudo probabilities
cur_answer = {
"question": doc.question,
"answer": doc.text,
"document_id": doc.id,
"context": doc.text,
"score": doc.score,
"probability": doc.probability,
"offset_start": 0,
"offset_end": len(doc.text),
"meta": doc.meta
}
results["answers"].append(cur_answer)
return results
def eval(
self,
label_index: str,
doc_index: str,
label_origin: str = "gold_label",
top_k_retriever: int = 10,
top_k_reader: int = 10,
return_preds: bool = False,
):
"""
Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
of the Retriever.
Returns a dict containing the following metrics:
- ``"retriever_recall"``: Proportion of questions for which correct document is among retrieved documents
- ``"retriever_map"``: Mean of average precision for each question. Rewards retrievers that give relevant
documents a higher rank. Considers all retrieved relevant documents. Average precision is normalized by
the number of all relevant documents per query.
- ``"retriever_mrr"``: Mean of reciprocal rank for each question. Rewards retrievers that give relevant
documents a higher rank. Only considers the highest ranked relevant document.
- ``"reader_top1_accuracy"``: Proportion of highest ranked predicted answers that overlap with corresponding correct answer
- ``"reader_top1_accuracy_has_answer"``: Proportion of highest ranked predicted answers that overlap
with corresponding correct answer for answerable questions
- ``"reader_top_k_accuracy"``: Proportion of predicted answers that overlap with corresponding correct answer
- ``"reader_topk_accuracy_has_answer"``: Proportion of predicted answers that overlap with corresponding correct answer
for answerable questions
- ``"reader_top1_em"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
correct answers
- ``"reader_top1_em_has_answer"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
correct answers for answerable questions
- ``"reader_topk_em"``: Proportion of exact matches of predicted answers with their corresponding correct answers
- ``"reader_topk_em_has_answer"``: Proportion of exact matches of predicted answers with their corresponding
correct answers for answerable questions
- ``"reader_top1_f1"``: Average overlap between highest ranked predicted answers and their corresponding correct answers
- ``"reader_top1_f1_has_answer"``: Average overlap between highest ranked predicted answers and their corresponding
correct answers for answerable questions
- ``"reader_topk_f1"``: Average overlap between predicted answers and their corresponding correct answers
- ``"reader_topk_f1_has_answer"``: Average overlap between predicted answers and their corresponding correct answers
for answerable questions
- ``"reader_top1_no_answer_accuracy"``: Proportion of correct predicting unanswerable question at highest ranked prediction
- ``"reader_topk_no_answer_accuracy"``: Proportion of correct predicting unanswerable question among all predictions
- ``"total_retrieve_time"``: Time retriever needed to retrieve documents for all questions
- ``"avg_retrieve_time"``: Average time needed to retrieve documents for one question
- ``"total_reader_time"``: Time reader needed to extract answer out of retrieved documents for all questions
where the correct document is among the retrieved ones
- ``"avg_reader_time"``: Average time needed to extract answer out of retrieved documents for one question
- ``"total_finder_time"``: Total time for whole pipeline
:param label_index: Elasticsearch index where labeled questions are stored
:type label_index: str
        :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
        :type doc_index: str
        :param label_origin: Only labels whose ``origin`` field matches this value are used for evaluation
        :type label_origin: str
:param top_k_retriever: How many documents per question to return and pass to reader
:type top_k_retriever: int
:param top_k_reader: How many answers to return per question
:type top_k_reader: int
:param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
:type return_preds: bool
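
        Example (sketch; assumes that evaluation labels and documents have already been written to
        ``label_index`` and ``doc_index`` of the retriever's document store; the index names below
        are placeholders):

            eval_results = finder.eval(label_index="label", doc_index="eval_document",
                                       top_k_retriever=10, top_k_reader=5)
            Finder.print_eval_results(eval_results)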
"""
if not self.reader or not self.retriever:
raise Exception("Finder needs to have a reader and retriever for the evaluation.")
finder_start_time = time.time()
# extract all questions for evaluation
filters = {"origin": [label_origin]}
questions = self.retriever.document_store.get_all_labels_aggregated(index=label_index, filters=filters)
counts = defaultdict(float) # type: Dict[str, float]
retrieve_times = []
read_times = []
# retrieve documents
questions_with_docs = []
retriever_start_time = time.time()
for q_idx, question in enumerate(questions):
question_string = question.question
single_retrieve_start = time.time()
retrieved_docs = self.retriever.retrieve(question_string, top_k=top_k_retriever, index=doc_index)
retrieve_times.append(time.time() - single_retrieve_start)
number_relevant_docs = len(set(question.multiple_document_ids))
# check if correct doc among retrieved docs
found_relevant_doc = False
relevant_docs_found = 0
current_avg_precision = 0.0
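            # Illustrative example of the metrics accumulated below: if a question has two relevant
            # documents and they are retrieved at ranks 1 and 3, the reciprocal rank is 1/1 = 1.0 and
            # the average precision is (1/1 + 2/3) / 2 ≈ 0.83.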
for doc_idx, doc in enumerate(retrieved_docs):
if doc.id in question.multiple_document_ids:
relevant_docs_found += 1
if not found_relevant_doc:
counts["correct_retrievals"] += 1
counts["summed_reciprocal_rank_retriever"] += 1 / (doc_idx + 1)
current_avg_precision += relevant_docs_found / (doc_idx + 1)
found_relevant_doc = True
if relevant_docs_found == number_relevant_docs:
break
            if found_relevant_doc:
                counts["summed_avg_precision_retriever"] += current_avg_precision / number_relevant_docs
                questions_with_docs.append({
                    "question": question,
                    "docs": retrieved_docs
                })
retriever_total_time = time.time() - retriever_start_time
counts["number_of_questions"] = q_idx + 1
previous_return_no_answers = self.reader.return_no_answers
self.reader.return_no_answers = True
predictions = []
# extract answers
reader_start_time = time.time()
for q_idx, question_docs in enumerate(questions_with_docs):
            if (q_idx + 1) % 100 == 0:
                logger.info(f"Processed {q_idx + 1} questions.")
question = question_docs["question"] # type: ignore
question_string = question.question
docs = question_docs["docs"] # type: ignore
single_reader_start = time.time()
predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader) # type: ignore
read_times.append(time.time() - single_reader_start)
if return_preds:
predictions.append(predicted_answers)
counts = eval_counts_reader(question, predicted_answers, counts)
counts["number_of_has_answer"] = counts["correct_retrievals"] - counts["number_of_no_answer"]
reader_total_time = time.time() - reader_start_time
finder_total_time = time.time() - finder_start_time
self.reader.return_no_answers = previous_return_no_answers # type: ignore
logger.info((f"{counts['correct_readings_topk']} out of {counts['number_of_questions']} questions were correctly"
f" answered {(counts['correct_readings_topk']/counts['number_of_questions']):.2%})."))
logger.info((f"{counts['number_of_questions']-counts['correct_retrievals']} questions could not be answered due "
f"to the retriever."))
logger.info((f"{counts['correct_retrievals']-counts['correct_readings_topk']} questions could not be answered "
f"due to the reader."))
eval_results = self.calc_eval_results(counts)
eval_results["total_retrieve_time"] = retriever_total_time
eval_results["avg_retrieve_time"] = mean(retrieve_times)
eval_results["total_reader_time"] = reader_total_time
eval_results["avg_reader_time"] = mean(read_times)
eval_results["total_finder_time"] = finder_total_time
if return_preds:
return {"metrics": eval_results, "predictions": predictions}
else:
return eval_results
def eval_batch(
self,
label_index: str,
doc_index : str,
label_origin: str = "gold_label",
top_k_retriever: int = 10,
top_k_reader: int = 10,
batch_size: int = 50,
return_preds: bool = False,
):
"""
Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
of the Retriever. Passes all retrieved question-document pairs to the Reader at once.
Returns a dict containing the following metrics:
- ``"retriever_recall"``: Proportion of questions for which correct document is among retrieved documents
- ``"retriever_map"``: Mean of average precision for each question. Rewards retrievers that give relevant
documents a higher rank. Considers all retrieved relevant documents. Average precision is normalized by
the number of all relevant documents per query.
- ``"retriever_mrr"``: Mean of reciprocal rank for each question. Rewards retrievers that give relevant
documents a higher rank. Only considers the highest ranked relevant document.
- ``"reader_top1_accuracy"``: Proportion of highest ranked predicted answers that overlap with corresponding correct answer
- ``"reader_top1_accuracy_has_answer"``: Proportion of highest ranked predicted answers that overlap
with corresponding correct answer for answerable questions
- ``"reader_top_k_accuracy"``: Proportion of predicted answers that overlap with corresponding correct answer
- ``"reader_topk_accuracy_has_answer"``: Proportion of predicted answers that overlap with corresponding correct answer
for answerable questions
- ``"reader_top1_em"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
correct answers
- ``"reader_top1_em_has_answer"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
correct answers for answerable questions
- ``"reader_topk_em"``: Proportion of exact matches of predicted answers with their corresponding correct answers
- ``"reader_topk_em_has_answer"``: Proportion of exact matches of predicted answers with their corresponding
correct answers for answerable questions
- ``"reader_top1_f1"``: Average overlap between highest ranked predicted answers and their corresponding correct answers
- ``"reader_top1_f1_has_answer"``: Average overlap between highest ranked predicted answers and their corresponding
correct answers for answerable questions
- ``"reader_topk_f1"``: Average overlap between predicted answers and their corresponding correct answers
- ``"reader_topk_f1_has_answer"``: Average overlap between predicted answers and their corresponding correct answers
for answerable questions
- ``"reader_top1_no_answer_accuracy"``: Proportion of correct predicting unanswerable question at highest ranked prediction
- ``"reader_topk_no_answer_accuracy"``: Proportion of correct predicting unanswerable question among all predictions
- ``"total_retrieve_time"``: Time retriever needed to retrieve documents for all questions
- ``"avg_retrieve_time"``: Average time needed to retrieve documents for one question
- ``"total_reader_time"``: Time reader needed to extract answer out of retrieved documents for all questions
where the correct document is among the retrieved ones
- ``"avg_reader_time"``: Average time needed to extract answer out of retrieved documents for one question
- ``"total_finder_time"``: Total time for whole pipeline
:param label_index: Elasticsearch index where labeled questions are stored
:type label_index: str
        :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
        :type doc_index: str
        :param label_origin: Only labels whose ``origin`` field matches this value are used for evaluation
        :type label_origin: str
:param top_k_retriever: How many documents per question to return and pass to reader
:type top_k_retriever: int
:param top_k_reader: How many answers to return per question
:type top_k_reader: int
        :param batch_size: Number of samples the reader processes together in one batch
:type batch_size: int
:param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
:type return_preds: bool
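
        Example (sketch; same prerequisites as ``eval()``, the index names below are placeholders):

            out = finder.eval_batch(label_index="label", doc_index="eval_document",
                                    batch_size=32, return_preds=True)
            Finder.print_eval_results(out["metrics"])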
"""
if not self.reader or not self.retriever:
raise Exception("Finder needs to have a reader and retriever for the evaluation.")
counts = defaultdict(float) # type: Dict[str, float]
finder_start_time = time.time()
# extract all questions for evaluation
filters = {"origin": [label_origin]}
questions = self.retriever.document_store.get_all_labels_aggregated(index=label_index, filters=filters)
number_of_questions = len(questions)
# retrieve documents
retriever_start_time = time.time()
questions_with_docs = self._retrieve_docs(questions, top_k=top_k_retriever, doc_index=doc_index)
retriever_total_time = time.time() - retriever_start_time
questions_with_correct_doc, \
summed_avg_precision_retriever, \
summed_reciprocal_rank_retriever = calculate_average_precision_and_reciprocal_rank(questions_with_docs)
correct_retrievals = len(questions_with_correct_doc)
# extract answers
previous_return_no_answers = self.reader.return_no_answers
self.reader.return_no_answers = True
reader_start_time = time.time()
predictions = self.reader.predict_batch(questions_with_correct_doc,
top_k=top_k_reader, batch_size=batch_size)
reader_total_time = time.time() - reader_start_time
for pred in predictions:
counts = eval_counts_reader_batch(pred, counts)
finder_total_time = time.time() - finder_start_time
results = calculate_reader_metrics(counts, correct_retrievals)
results["retriever_recall"] = correct_retrievals / number_of_questions
results["retriever_map"] = summed_avg_precision_retriever / number_of_questions
results["retriever_mrr"] = summed_reciprocal_rank_retriever / number_of_questions
results["total_retrieve_time"] = retriever_total_time
results["avg_retrieve_time"] = retriever_total_time / number_of_questions
results["total_reader_time"] = reader_total_time
results["avg_reader_time"] = reader_total_time / correct_retrievals
results["total_finder_time"] = finder_total_time
logger.info((f"{counts['correct_readings_topk']} out of {number_of_questions} questions were correctly "
f"answered ({(counts['correct_readings_topk'] / number_of_questions):.2%})."))
logger.info(f"{number_of_questions - correct_retrievals} questions could not be answered due to the retriever.")
logger.info(f"{correct_retrievals - counts['correct_readings_topk']} questions could not be answered due to the reader.")
if return_preds:
return {"metrics": results, "predictions": predictions}
else:
return results
def _retrieve_docs(self, questions: List[MultiLabel], top_k: int, doc_index: str):
# Retrieves documents for a list of Labels (= questions)
questions_with_docs = []
for question in questions:
question_string = question.question
retrieved_docs = self.retriever.retrieve(question_string, top_k=top_k, index=doc_index) # type: ignore
questions_with_docs.append({
"question": question,
"docs": retrieved_docs
})
return questions_with_docs
@staticmethod
def print_eval_results(finder_eval_results: Dict):
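        # Accepts either the plain metrics dict returned by `eval()` / `eval_batch()` or, when
        # `return_preds=True` was used, the wrapper dict holding "metrics" and "predictions".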
if "predictions" in finder_eval_results.keys():
finder_eval_results = finder_eval_results["metrics"]
print("\n___Retriever Metrics in Finder___")
print(f"Retriever Recall : {finder_eval_results['retriever_recall']:.3f}")
print(f"Retriever Mean Avg Precision: {finder_eval_results['retriever_map']:.3f}")
print(f"Retriever Mean Reciprocal Rank: {finder_eval_results['retriever_mrr']:.3f}")
# Reader is only evaluated with those questions, where the correct document is among the retrieved ones
print("\n___Reader Metrics in Finder___")
print("Top-k accuracy")
print(f"Reader Top-1 accuracy : {finder_eval_results['reader_top1_accuracy']:.3f}")
print(f"Reader Top-1 accuracy (has answer): {finder_eval_results['reader_top1_accuracy_has_answer']:.3f}")
print(f"Reader Top-k accuracy : {finder_eval_results['reader_topk_accuracy']:.3f}")
print(f"Reader Top-k accuracy (has answer): {finder_eval_results['reader_topk_accuracy_has_answer']:.3f}")
print("Exact Match")
print(f"Reader Top-1 EM : {finder_eval_results['reader_top1_em']:.3f}")
print(f"Reader Top-1 EM (has answer) : {finder_eval_results['reader_top1_em_has_answer']:.3f}")
print(f"Reader Top-k EM : {finder_eval_results['reader_topk_em']:.3f}")
print(f"Reader Top-k EM (has answer) : {finder_eval_results['reader_topk_em_has_answer']:.3f}")
print("F1 score")
print(f"Reader Top-1 F1 : {finder_eval_results['reader_top1_f1']:.3f}")
print(f"Reader Top-1 F1 (has answer) : {finder_eval_results['reader_top1_f1_has_answer']:.3f}")
print(f"Reader Top-k F1 : {finder_eval_results['reader_topk_f1']:.3f}")
print(f"Reader Top-k F1 (has answer) : {finder_eval_results['reader_topk_f1_has_answer']:.3f}")
        if finder_eval_results['reader_top1_no_answer_accuracy'] is not None:
print("No Answer")
print(f"Reader Top-1 no-answer accuracy : {finder_eval_results['reader_top1_no_answer_accuracy']:.3f}")
print(f"Reader Top-k no-answer accuracy : {finder_eval_results['reader_topk_no_answer_accuracy']:.3f}")
# Time measurements
print("\n___Time Measurements___")
print(f"Total retrieve time : {finder_eval_results['total_retrieve_time']:.3f}")
print(f"Avg retrieve time per question: {finder_eval_results['avg_retrieve_time']:.3f}")
print(f"Total reader timer : {finder_eval_results['total_reader_time']:.3f}")
print(f"Avg read time per question : {finder_eval_results['avg_reader_time']:.3f}")
print(f"Total Finder time : {finder_eval_results['total_finder_time']:.3f}")
@staticmethod
def calc_eval_results(eval_counts: Dict):
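        # `eval_counts` is the counts dict accumulated in `eval()` (a defaultdict(float) filled via
        # `eval_counts_reader`), so any key that was never incremented evaluates to 0.0.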
eval_results = {}
number_of_questions = eval_counts["number_of_questions"]
correct_retrievals = eval_counts["correct_retrievals"]
number_of_has_answer = eval_counts["number_of_has_answer"]
number_of_no_answer = eval_counts["number_of_no_answer"]
eval_results["retriever_recall"] = eval_counts["correct_retrievals"] / number_of_questions
eval_results["retriever_map"] = eval_counts["summed_avg_precision_retriever"] / number_of_questions
eval_results["retriever_mrr"] = eval_counts["summed_reciprocal_rank_retriever"] / number_of_questions
eval_results["reader_top1_accuracy"] = eval_counts["correct_readings_top1"] / correct_retrievals
eval_results["reader_top1_accuracy_has_answer"] = eval_counts["correct_readings_top1_has_answer"] / number_of_has_answer
eval_results["reader_topk_accuracy"] = eval_counts["correct_readings_topk"] / correct_retrievals
eval_results["reader_topk_accuracy_has_answer"] = eval_counts["correct_readings_topk_has_answer"] / number_of_has_answer
eval_results["reader_top1_em"] = eval_counts["exact_matches_top1"] / correct_retrievals
eval_results["reader_top1_em_has_answer"] = eval_counts["exact_matches_top1_has_answer"] / number_of_has_answer
eval_results["reader_topk_em"] = eval_counts["exact_matches_topk"] / correct_retrievals
eval_results["reader_topk_em_has_answer"] = eval_counts["exact_matches_topk_has_answer"] / number_of_has_answer
eval_results["reader_top1_f1"] = eval_counts["summed_f1_top1"] / correct_retrievals
eval_results["reader_top1_f1_has_answer"] = eval_counts["summed_f1_top1_has_answer"] / number_of_has_answer
eval_results["reader_topk_f1"] = eval_counts["summed_f1_topk"] / correct_retrievals
eval_results["reader_topk_f1_has_answer"] = eval_counts["summed_f1_topk_has_answer"] / number_of_has_answer
if number_of_no_answer:
eval_results["reader_top1_no_answer_accuracy"] = eval_counts["correct_no_answers_top1"] / number_of_no_answer
eval_results["reader_topk_no_answer_accuracy"] = eval_counts["correct_no_answers_topk"] / number_of_no_answer
else:
eval_results["reader_top1_no_answer_accuracy"] = None
eval_results["reader_topk_no_answer_accuracy"] = None
return eval_results