import logging
import time
from copy import deepcopy
from statistics import mean
from typing import Optional, Dict, Any, List
from collections import defaultdict

from haystack.reader.base import BaseReader
from haystack.retriever.base import BaseRetriever
from haystack import MultiLabel
from haystack.eval import calculate_average_precision_and_reciprocal_rank, eval_counts_reader_batch, \
    calculate_reader_metrics, eval_counts_reader

logger = logging.getLogger(__name__)


class Finder:
    """
    Finder ties together instances of the Reader and Retriever class.

    It provides an interface to predict the top n answers for a given question.
    """

    def __init__(self, reader: Optional[BaseReader], retriever: Optional[BaseRetriever]):
        """
        Initialize a Finder instance.

        :param reader: Reader instance
        :param retriever: Retriever instance
        """
        logger.warning(
            """DEPRECATION WARNINGS:
            1. The 'Finder' class will be deprecated in the next Haystack release in
            favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
            including Retriever, Readers, and Generators.
            For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
            2. The `question` parameter in search requests & results is renamed to `query`."""
        )
        self.retriever = retriever
        self.reader = reader
        if self.reader is None and self.retriever is None:
            raise AttributeError("Finder: self.reader and self.retriever cannot both be None")
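
    # Illustrative usage (a minimal sketch, not part of the original module). It assumes the usual
    # Haystack 0.x components -- an ElasticsearchDocumentStore that already contains indexed
    # documents, an ElasticsearchRetriever, and a FARMReader -- and shows how the two components
    # are tied together; module paths follow the 0.x layout and may differ between releases:
    #
    #   from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
    #   from haystack.retriever.sparse import ElasticsearchRetriever
    #   from haystack.reader.farm import FARMReader
    #
    #   document_store = ElasticsearchDocumentStore(host="localhost", index="document")
    #   retriever = ElasticsearchRetriever(document_store=document_store)
    #   reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
    #   finder = Finder(reader=reader, retriever=retriever)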

    def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int = 10, filters: Optional[dict] = None, index: Optional[str] = None):
        """
        Get the top k answers for a given question.

        :param question: The question string
        :param top_k_reader: Number of answers returned by the reader
        :param top_k_retriever: Number of text units to be retrieved
        :param filters: Limit scope to documents having the given meta data values.
                        The format for the dict is ``{"key-1": ["value-1", "value-2"], "key-2": ["value-3"], ...}``
        :param index: Index to retrieve documents from
        :return:
        """

        logger.warning(
            """DEPRECATION WARNINGS:
            1. The 'Finder' class will be deprecated in the next Haystack release in
            favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
            including Retriever, Readers, and Generators.
            For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
            2. The `question` parameter in search requests & results is renamed to `query`."""
        )
        if self.retriever is None or self.reader is None:
            raise AttributeError("Finder.get_answers requires self.retriever AND self.reader")

        # 1) Apply the retriever (with optional filters) to get fast candidate documents
        documents = self.retriever.retrieve(question, filters=filters, top_k=top_k_retriever, index=index)
        logger.info(f"Got {len(documents)} candidates from retriever")
        logger.debug(f"Retrieved document IDs: {[doc.id for doc in documents]}")

        if len(documents) == 0:
            logger.info("Retriever did not return any documents. Skipping reader ...")
            empty_result = {"question": question, "answers": []}
            return empty_result

        # 2) Apply the reader to get granular answer(s)
        len_chars = sum([len(d.text) for d in documents])
        logger.info(f"Reader is looking for a detailed answer in {len_chars} chars ...")

        results = self.reader.predict(query=question,
                                      documents=documents,
                                      top_k=top_k_reader)  # type: Dict[str, Any]
        results["question"] = results["query"]

        # Add the corresponding document meta data, if an answer contains the document_id
        for ans in results["answers"]:
            ans["meta"] = {}
            for doc in documents:
                if doc.id == ans["document_id"]:
                    ans["meta"] = deepcopy(doc.meta)

        return results
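
    # Illustrative call (a minimal sketch, not part of the original module), assuming `finder`
    # was built as in the example above. The reader returns a dict whose "answers" list this
    # method enriches with the matching document's meta data:
    #
    #   prediction = finder.get_answers(question="Who is the father of Arya Stark?",
    #                                   top_k_retriever=10, top_k_reader=3)
    #   # prediction roughly looks like:
    #   # {"query": "...", "question": "...",
    #   #  "answers": [{"answer": "...", "score": ..., "probability": ...,
    #   #               "context": "...", "document_id": "...", "meta": {...}}, ...]}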

    def get_answers_via_similar_questions(self, question: str, top_k_retriever: int = 10, filters: Optional[dict] = None, index: Optional[str] = None):
        """
        Get the top k answers for a given question using only a retriever.

        :param question: The question string
        :param top_k_retriever: Number of text units to be retrieved
        :param filters: Limit scope to documents having the given meta data values.
                        The format for the dict is ``{"key-1": ["value-1", "value-2"], "key-2": ["value-3"], ...}``
        :param index: Index to retrieve documents from
        :return:
        """

        if self.retriever is None:
            raise AttributeError("Finder.get_answers_via_similar_questions requires self.retriever")

        results = {"question": question, "answers": []}  # type: Dict[str, Any]

        # 1) Apply the retriever to match similar questions via cosine similarity of embeddings
        documents = self.retriever.retrieve(question, top_k=top_k_retriever, filters=filters, index=index)

        # 2) Format response
        for doc in documents:
            # TODO: proper calibration of pseudo probabilities
            cur_answer = {
                "question": doc.question,
                "answer": doc.text,
                "document_id": doc.id,
                "context": doc.text,
                "score": doc.score,
                "probability": doc.probability,
                "offset_start": 0,
                "offset_end": len(doc.text),
                "meta": doc.meta
            }

            results["answers"].append(cur_answer)

        return results
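
    # Illustrative call (a minimal sketch, not part of the original module). This FAQ-style mode
    # assumes a retriever such as EmbeddingRetriever whose index stores question/answer pairs, so
    # matching is done purely on question similarity and no reader is needed; ``embedding_retriever``
    # below is a placeholder for such an instance:
    #
    #   faq_finder = Finder(reader=None, retriever=embedding_retriever)
    #   prediction = faq_finder.get_answers_via_similar_questions(
    #       question="How do I reset my password?", top_k_retriever=3)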

    def eval(
        self,
        label_index: str,
        doc_index: str,
        label_origin: str = "gold_label",
        top_k_retriever: int = 10,
        top_k_reader: int = 10,
        return_preds: bool = False,
    ):
        """
        Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
        of the Retriever.

        Returns a dict containing the following metrics:
            - ``"retriever_recall"``: Proportion of questions for which the correct document is among the retrieved documents
            - ``"retriever_map"``: Mean of average precision for each question. Rewards retrievers that give relevant
              documents a higher rank. Considers all retrieved relevant documents. Average precision is normalized by
              the number of all relevant documents per query.
            - ``"retriever_mrr"``: Mean of reciprocal rank for each question. Rewards retrievers that give relevant
              documents a higher rank. Only considers the highest ranked relevant document.
            - ``"reader_top1_accuracy"``: Proportion of highest ranked predicted answers that overlap with the corresponding correct answer
            - ``"reader_top1_accuracy_has_answer"``: Proportion of highest ranked predicted answers that overlap
              with the corresponding correct answer, for answerable questions only
            - ``"reader_topk_accuracy"``: Proportion of predicted answers that overlap with the corresponding correct answer
            - ``"reader_topk_accuracy_has_answer"``: Proportion of predicted answers that overlap with the corresponding correct answer,
              for answerable questions only
            - ``"reader_top1_em"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
              correct answers
            - ``"reader_top1_em_has_answer"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
              correct answers, for answerable questions only
            - ``"reader_topk_em"``: Proportion of exact matches of predicted answers with their corresponding correct answers
            - ``"reader_topk_em_has_answer"``: Proportion of exact matches of predicted answers with their corresponding
              correct answers, for answerable questions only
            - ``"reader_top1_f1"``: Average overlap between highest ranked predicted answers and their corresponding correct answers
            - ``"reader_top1_f1_has_answer"``: Average overlap between highest ranked predicted answers and their corresponding
              correct answers, for answerable questions only
            - ``"reader_topk_f1"``: Average overlap between predicted answers and their corresponding correct answers
            - ``"reader_topk_f1_has_answer"``: Average overlap between predicted answers and their corresponding correct answers,
              for answerable questions only
            - ``"reader_top1_no_answer_accuracy"``: Proportion of unanswerable questions that are correctly predicted as unanswerable
              by the highest ranked prediction
            - ``"reader_topk_no_answer_accuracy"``: Proportion of unanswerable questions that are correctly predicted as unanswerable
              among the top k predictions
            - ``"total_retrieve_time"``: Time the retriever needed to retrieve documents for all questions
            - ``"avg_retrieve_time"``: Average time needed to retrieve documents for one question
            - ``"total_reader_time"``: Time the reader needed to extract answers from the retrieved documents, for all questions
              where the correct document is among the retrieved ones
            - ``"avg_reader_time"``: Average time needed to extract an answer from the retrieved documents for one question
            - ``"total_finder_time"``: Total time for the whole pipeline

        :param label_index: Elasticsearch index where labeled questions are stored
        :type label_index: str
        :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
        :type doc_index: str
        :param label_origin: Only labels with this ``origin`` value (e.g. "gold_label") are used for the evaluation
        :type label_origin: str
        :param top_k_retriever: How many documents per question to return and pass to the reader
        :type top_k_retriever: int
        :param top_k_reader: How many answers to return per question
        :type top_k_reader: int
        :param return_preds: Whether to add predictions to the returned dictionary. If True, the returned dictionary
                             contains the keys "predictions" and "metrics".
        :type return_preds: bool
        """

        if not self.reader or not self.retriever:
            raise Exception("Finder needs to have a reader and retriever for the evaluation.")

        finder_start_time = time.time()
        # extract all questions for evaluation
        filters = {"origin": [label_origin]}
        questions = self.retriever.document_store.get_all_labels_aggregated(index=label_index, filters=filters)

        counts = defaultdict(float)  # type: Dict[str, float]
        retrieve_times = []
        read_times = []

        # retrieve documents
        questions_with_docs = []
        retriever_start_time = time.time()
        for q_idx, question in enumerate(questions):
            question_string = question.question
            single_retrieve_start = time.time()
            retrieved_docs = self.retriever.retrieve(question_string, top_k=top_k_retriever, index=doc_index)
            retrieve_times.append(time.time() - single_retrieve_start)
            number_relevant_docs = len(set(question.multiple_document_ids))

            # check whether the correct documents are among the retrieved ones
            found_relevant_doc = False
            relevant_docs_found = 0
            current_avg_precision = 0.0
            for doc_idx, doc in enumerate(retrieved_docs):
                if doc.id in question.multiple_document_ids:
                    relevant_docs_found += 1
                    if not found_relevant_doc:
                        counts["correct_retrievals"] += 1
                        counts["summed_reciprocal_rank_retriever"] += 1 / (doc_idx + 1)
                    current_avg_precision += relevant_docs_found / (doc_idx + 1)

                    found_relevant_doc = True
                    if relevant_docs_found == number_relevant_docs:
                        break

            if found_relevant_doc:
                # normalize average precision by the number of all relevant documents for this question
                counts["summed_avg_precision_retriever"] += current_avg_precision / number_relevant_docs
                questions_with_docs.append({
                    "question": question,
                    "docs": retrieved_docs
                })

        retriever_total_time = time.time() - retriever_start_time
        counts["number_of_questions"] = q_idx + 1

        previous_return_no_answers = self.reader.return_no_answers
        self.reader.return_no_answers = True

        predictions = []
        # extract answers
        reader_start_time = time.time()
        for q_idx, question_docs in enumerate(questions_with_docs):
            if (q_idx + 1) % 100 == 0:
                print(f"Processed {q_idx+1} questions.")

            question = question_docs["question"]  # type: ignore
            question_string = question.question
            docs = question_docs["docs"]  # type: ignore
            single_reader_start = time.time()
            predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader)  # type: ignore
            read_times.append(time.time() - single_reader_start)
            if return_preds:
                predictions.append(predicted_answers)
            counts = eval_counts_reader(question, predicted_answers, counts)

        counts["number_of_has_answer"] = counts["correct_retrievals"] - counts["number_of_no_answer"]

        reader_total_time = time.time() - reader_start_time
        finder_total_time = time.time() - finder_start_time

        self.reader.return_no_answers = previous_return_no_answers  # type: ignore

        logger.info((f"{counts['correct_readings_topk']} out of {counts['number_of_questions']} questions were correctly"
                     f" answered ({(counts['correct_readings_topk']/counts['number_of_questions']):.2%})."))
        logger.info((f"{counts['number_of_questions']-counts['correct_retrievals']} questions could not be answered due "
                     f"to the retriever."))
        logger.info((f"{counts['correct_retrievals']-counts['correct_readings_topk']} questions could not be answered "
                     f"due to the reader."))

        eval_results = self.calc_eval_results(counts)
        eval_results["total_retrieve_time"] = retriever_total_time
        eval_results["avg_retrieve_time"] = mean(retrieve_times)
        eval_results["total_reader_time"] = reader_total_time
        eval_results["avg_reader_time"] = mean(read_times)
        eval_results["total_finder_time"] = finder_total_time

        if return_preds:
            return {"metrics": eval_results, "predictions": predictions}
        else:
            return eval_results
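
    # Illustrative usage (a minimal sketch, not part of the original module), assuming labels and
    # documents were written to Elasticsearch indices such as "label" and "eval_document", e.g. via
    # the document store's add_eval_data(); the index names here are placeholders:
    #
    #   eval_results = finder.eval(label_index="label", doc_index="eval_document",
    #                              top_k_retriever=10, top_k_reader=5)
    #   Finder.print_eval_results(eval_results)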

    def eval_batch(
        self,
        label_index: str,
        doc_index: str,
        label_origin: str = "gold_label",
        top_k_retriever: int = 10,
        top_k_reader: int = 10,
        batch_size: int = 50,
        return_preds: bool = False,
    ):
        """
        Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
        of the Retriever. Passes all retrieved question-document pairs to the Reader at once.

        Returns a dict containing the following metrics:
            - ``"retriever_recall"``: Proportion of questions for which the correct document is among the retrieved documents
            - ``"retriever_map"``: Mean of average precision for each question. Rewards retrievers that give relevant
              documents a higher rank. Considers all retrieved relevant documents. Average precision is normalized by
              the number of all relevant documents per query.
            - ``"retriever_mrr"``: Mean of reciprocal rank for each question. Rewards retrievers that give relevant
              documents a higher rank. Only considers the highest ranked relevant document.
            - ``"reader_top1_accuracy"``: Proportion of highest ranked predicted answers that overlap with the corresponding correct answer
            - ``"reader_top1_accuracy_has_answer"``: Proportion of highest ranked predicted answers that overlap
              with the corresponding correct answer, for answerable questions only
            - ``"reader_topk_accuracy"``: Proportion of predicted answers that overlap with the corresponding correct answer
            - ``"reader_topk_accuracy_has_answer"``: Proportion of predicted answers that overlap with the corresponding correct answer,
              for answerable questions only
            - ``"reader_top1_em"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
              correct answers
            - ``"reader_top1_em_has_answer"``: Proportion of exact matches of highest ranked predicted answers with their corresponding
              correct answers, for answerable questions only
            - ``"reader_topk_em"``: Proportion of exact matches of predicted answers with their corresponding correct answers
            - ``"reader_topk_em_has_answer"``: Proportion of exact matches of predicted answers with their corresponding
              correct answers, for answerable questions only
            - ``"reader_top1_f1"``: Average overlap between highest ranked predicted answers and their corresponding correct answers
            - ``"reader_top1_f1_has_answer"``: Average overlap between highest ranked predicted answers and their corresponding
              correct answers, for answerable questions only
            - ``"reader_topk_f1"``: Average overlap between predicted answers and their corresponding correct answers
            - ``"reader_topk_f1_has_answer"``: Average overlap between predicted answers and their corresponding correct answers,
              for answerable questions only
            - ``"reader_top1_no_answer_accuracy"``: Proportion of unanswerable questions that are correctly predicted as unanswerable
              by the highest ranked prediction
            - ``"reader_topk_no_answer_accuracy"``: Proportion of unanswerable questions that are correctly predicted as unanswerable
              among the top k predictions
            - ``"total_retrieve_time"``: Time the retriever needed to retrieve documents for all questions
            - ``"avg_retrieve_time"``: Average time needed to retrieve documents for one question
            - ``"total_reader_time"``: Time the reader needed to extract answers from the retrieved documents, for all questions
              where the correct document is among the retrieved ones
            - ``"avg_reader_time"``: Average time needed to extract an answer from the retrieved documents for one question
            - ``"total_finder_time"``: Total time for the whole pipeline

        :param label_index: Elasticsearch index where labeled questions are stored
        :type label_index: str
        :param doc_index: Elasticsearch index where documents that are used for evaluation are stored
        :type doc_index: str
        :param label_origin: Only labels with this ``origin`` value (e.g. "gold_label") are used for the evaluation
        :type label_origin: str
        :param top_k_retriever: How many documents per question to return and pass to the reader
        :type top_k_retriever: int
        :param top_k_reader: How many answers to return per question
        :type top_k_reader: int
        :param batch_size: Number of samples per batch computed at once
        :type batch_size: int
        :param return_preds: Whether to add predictions to the returned dictionary. If True, the returned dictionary
                             contains the keys "predictions" and "metrics".
        :type return_preds: bool
        """

        if not self.reader or not self.retriever:
            raise Exception("Finder needs to have a reader and retriever for the evaluation.")

        counts = defaultdict(float)  # type: Dict[str, float]
        finder_start_time = time.time()

        # extract all questions for evaluation
        filters = {"origin": [label_origin]}
        questions = self.retriever.document_store.get_all_labels_aggregated(index=label_index, filters=filters)
        number_of_questions = len(questions)

        # retrieve documents
        retriever_start_time = time.time()
        questions_with_docs = self._retrieve_docs(questions, top_k=top_k_retriever, doc_index=doc_index)
        retriever_total_time = time.time() - retriever_start_time

        questions_with_correct_doc, \
            summed_avg_precision_retriever, \
            summed_reciprocal_rank_retriever = calculate_average_precision_and_reciprocal_rank(questions_with_docs)

        correct_retrievals = len(questions_with_correct_doc)

        # extract answers
        previous_return_no_answers = self.reader.return_no_answers
        self.reader.return_no_answers = True
        reader_start_time = time.time()
        predictions = self.reader.predict_batch(questions_with_correct_doc,
                                                top_k=top_k_reader, batch_size=batch_size)
        reader_total_time = time.time() - reader_start_time

        # restore the reader's original no-answer setting
        self.reader.return_no_answers = previous_return_no_answers  # type: ignore

        for pred in predictions:
            counts = eval_counts_reader_batch(pred, counts)

        finder_total_time = time.time() - finder_start_time

        results = calculate_reader_metrics(counts, correct_retrievals)
        results["retriever_recall"] = correct_retrievals / number_of_questions
        results["retriever_map"] = summed_avg_precision_retriever / number_of_questions
        results["retriever_mrr"] = summed_reciprocal_rank_retriever / number_of_questions
        results["total_retrieve_time"] = retriever_total_time
        results["avg_retrieve_time"] = retriever_total_time / number_of_questions
        results["total_reader_time"] = reader_total_time
        results["avg_reader_time"] = reader_total_time / correct_retrievals
        results["total_finder_time"] = finder_total_time

        logger.info((f"{counts['correct_readings_topk']} out of {number_of_questions} questions were correctly "
                     f"answered ({(counts['correct_readings_topk'] / number_of_questions):.2%})."))
        logger.info(f"{number_of_questions - correct_retrievals} questions could not be answered due to the retriever.")
        logger.info(f"{correct_retrievals - counts['correct_readings_topk']} questions could not be answered due to the reader.")

        if return_preds:
            return {"metrics": results, "predictions": predictions}
        else:
            return results
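
    # Illustrative usage (a minimal sketch, not part of the original module): same setup as for
    # eval(), but all question-document pairs are passed to the reader in batches, which is
    # typically faster on GPU; the batch size below is only an example value:
    #
    #   eval_results = finder.eval_batch(label_index="label", doc_index="eval_document",
    #                                    top_k_retriever=10, top_k_reader=5, batch_size=32)
    #   Finder.print_eval_results(eval_results)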

    def _retrieve_docs(self, questions: List[MultiLabel], top_k: int, doc_index: str):
        # Retrieves documents for a list of Labels (= questions)
        questions_with_docs = []

        for question in questions:
            question_string = question.question
            retrieved_docs = self.retriever.retrieve(question_string, top_k=top_k, index=doc_index)  # type: ignore
            questions_with_docs.append({
                "question": question,
                "docs": retrieved_docs
            })

        return questions_with_docs
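
    # Each entry returned by _retrieve_docs pairs one aggregated label with its retrieved documents,
    # roughly {"question": <MultiLabel>, "docs": [<Document>, ...]}. This is the structure that
    # calculate_average_precision_and_reciprocal_rank() consumes in eval_batch() above.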

    @staticmethod
    def print_eval_results(finder_eval_results: Dict):
        if "predictions" in finder_eval_results.keys():
            finder_eval_results = finder_eval_results["metrics"]

        print("\n___Retriever Metrics in Finder___")
        print(f"Retriever Recall : {finder_eval_results['retriever_recall']:.3f}")
        print(f"Retriever Mean Avg Precision: {finder_eval_results['retriever_map']:.3f}")
        print(f"Retriever Mean Reciprocal Rank: {finder_eval_results['retriever_mrr']:.3f}")

        # Reader is only evaluated on those questions where the correct document is among the retrieved ones
        print("\n___Reader Metrics in Finder___")
        print("Top-k accuracy")
        print(f"Reader Top-1 accuracy : {finder_eval_results['reader_top1_accuracy']:.3f}")
        print(f"Reader Top-1 accuracy (has answer): {finder_eval_results['reader_top1_accuracy_has_answer']:.3f}")
        print(f"Reader Top-k accuracy : {finder_eval_results['reader_topk_accuracy']:.3f}")
        print(f"Reader Top-k accuracy (has answer): {finder_eval_results['reader_topk_accuracy_has_answer']:.3f}")
        print("Exact Match")
        print(f"Reader Top-1 EM : {finder_eval_results['reader_top1_em']:.3f}")
        print(f"Reader Top-1 EM (has answer) : {finder_eval_results['reader_top1_em_has_answer']:.3f}")
        print(f"Reader Top-k EM : {finder_eval_results['reader_topk_em']:.3f}")
        print(f"Reader Top-k EM (has answer) : {finder_eval_results['reader_topk_em_has_answer']:.3f}")
        print("F1 score")
        print(f"Reader Top-1 F1 : {finder_eval_results['reader_top1_f1']:.3f}")
        print(f"Reader Top-1 F1 (has answer) : {finder_eval_results['reader_top1_f1_has_answer']:.3f}")
        print(f"Reader Top-k F1 : {finder_eval_results['reader_topk_f1']:.3f}")
        print(f"Reader Top-k F1 (has answer) : {finder_eval_results['reader_topk_f1_has_answer']:.3f}")
        # No-answer metrics are None if the evaluation set contained no unanswerable questions
        if finder_eval_results['reader_top1_no_answer_accuracy'] is not None:
            print("No Answer")
            print(f"Reader Top-1 no-answer accuracy : {finder_eval_results['reader_top1_no_answer_accuracy']:.3f}")
            print(f"Reader Top-k no-answer accuracy : {finder_eval_results['reader_topk_no_answer_accuracy']:.3f}")

        # Time measurements
        print("\n___Time Measurements___")
        print(f"Total retrieve time : {finder_eval_results['total_retrieve_time']:.3f}")
        print(f"Avg retrieve time per question: {finder_eval_results['avg_retrieve_time']:.3f}")
        print(f"Total reader time : {finder_eval_results['total_reader_time']:.3f}")
        print(f"Avg read time per question : {finder_eval_results['avg_reader_time']:.3f}")
        print(f"Total Finder time : {finder_eval_results['total_finder_time']:.3f}")

    @staticmethod
    def calc_eval_results(eval_counts: Dict):
        """
        Turns the raw counts collected during ``eval()`` into the final retriever and reader metrics.
        Reader metrics are computed only over those questions where the correct document was retrieved.
        """
        eval_results = {}
        number_of_questions = eval_counts["number_of_questions"]
        correct_retrievals = eval_counts["correct_retrievals"]
        number_of_has_answer = eval_counts["number_of_has_answer"]
        number_of_no_answer = eval_counts["number_of_no_answer"]

        eval_results["retriever_recall"] = eval_counts["correct_retrievals"] / number_of_questions
        eval_results["retriever_map"] = eval_counts["summed_avg_precision_retriever"] / number_of_questions
        eval_results["retriever_mrr"] = eval_counts["summed_reciprocal_rank_retriever"] / number_of_questions

        eval_results["reader_top1_accuracy"] = eval_counts["correct_readings_top1"] / correct_retrievals
        eval_results["reader_top1_accuracy_has_answer"] = eval_counts["correct_readings_top1_has_answer"] / number_of_has_answer
        eval_results["reader_topk_accuracy"] = eval_counts["correct_readings_topk"] / correct_retrievals
        eval_results["reader_topk_accuracy_has_answer"] = eval_counts["correct_readings_topk_has_answer"] / number_of_has_answer
        eval_results["reader_top1_em"] = eval_counts["exact_matches_top1"] / correct_retrievals
        eval_results["reader_top1_em_has_answer"] = eval_counts["exact_matches_top1_has_answer"] / number_of_has_answer
        eval_results["reader_topk_em"] = eval_counts["exact_matches_topk"] / correct_retrievals
        eval_results["reader_topk_em_has_answer"] = eval_counts["exact_matches_topk_has_answer"] / number_of_has_answer
        eval_results["reader_top1_f1"] = eval_counts["summed_f1_top1"] / correct_retrievals
        eval_results["reader_top1_f1_has_answer"] = eval_counts["summed_f1_top1_has_answer"] / number_of_has_answer
        eval_results["reader_topk_f1"] = eval_counts["summed_f1_topk"] / correct_retrievals
        eval_results["reader_topk_f1_has_answer"] = eval_counts["summed_f1_topk_has_answer"] / number_of_has_answer

        if number_of_no_answer:
            eval_results["reader_top1_no_answer_accuracy"] = eval_counts["correct_no_answers_top1"] / number_of_no_answer
            eval_results["reader_topk_no_answer_accuracy"] = eval_counts["correct_no_answers_topk"] / number_of_no_answer
        else:
            eval_results["reader_top1_no_answer_accuracy"] = None
            eval_results["reader_topk_no_answer_accuracy"] = None

        return eval_results
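
    # Worked example for calc_eval_results (illustrative numbers only): with
    # eval_counts = {"number_of_questions": 100, "correct_retrievals": 80,
    #                "correct_readings_top1": 40, ...}
    # the method yields retriever_recall = 80 / 100 = 0.8 and reader_top1_accuracy = 40 / 80 = 0.5,
    # i.e. reader metrics are normalized by the retrievals that contained a correct document,
    # not by the total number of questions.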