Mirror of https://github.com/deepset-ai/haystack.git
Add nDCG to pipeline.eval()'s document metrics (#2008)
* add ndcg metric
* fix merge
* Add latest docstring and tutorial changes

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
parent 2c063e960e
commit f42d2e8ba0
@@ -301,6 +301,7 @@ pipeline.print_eval_report(eval_result_with_upper_bounds)
 ```
 
 ## Evaluation of Individual Components: Retriever
 
 Sometimes you might want to evaluate individual components, for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself.
 
 Here we evaluate only the retriever, based on whether the gold_label document is retrieved.
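For orientation, here is a minimal sketch of how the new metric surfaces when evaluating only a retriever. The pipeline setup, label list, and `top_k` value are illustrative assumptions on top of the Haystack v1 eval API; only `calculate_metrics()` and the `"ndcg"` key come from this change.

```python
from haystack.pipelines import DocumentSearchPipeline

# Assumptions: `retriever` is an initialized Haystack retriever and `eval_labels`
# is a list of MultiLabel objects carrying the gold documents, as in the eval tutorial.
pipeline = DocumentSearchPipeline(retriever=retriever)
eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})

metrics = eval_result.calculate_metrics()
print(metrics["Retriever"]["ndcg"])  # nDCG is reported alongside recall, precision, map and mrr
```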
@@ -947,13 +947,17 @@ class EvaluationResult:
             recall_single_hit = min(num_retrieved_relevants, 1)
             precision = num_retrieved_relevants / retrieved if retrieved > 0 else 0.0
             rr = 1.0 / rank_retrieved_relevants.min() if len(rank_retrieved_relevants) > 0 else 0.0
+            dcg = np.sum([1.0 / np.log2(rank+1) for rank in rank_retrieved_relevants]) if len(rank_retrieved_relevants) > 0 else 0.0
+            idcg = np.sum([1.0 / np.log2(rank+1) for rank in range(1, num_relevants+1)]) if num_relevants > 0 else 1.0
+            ndcg = dcg / idcg
 
             metrics.append({
                 "recall_multi_hit": recall_multi_hit,
                 "recall_single_hit": recall_single_hit,
                 "precision": precision,
                 "map": avg_precision,
-                "mrr": rr
+                "mrr": rr,
+                "ndcg": ndcg
             })
 
         metrics_df = pd.DataFrame.from_records(metrics, index=queries)
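The added lines compute nDCG with binary relevance and a logarithmic position discount. Below is a self-contained sketch of the same arithmetic; the function name and inputs are illustrative and not part of the Haystack API.

```python
import numpy as np

def binary_ndcg(rank_retrieved_relevants: np.ndarray, num_relevants: int) -> float:
    """nDCG with binary relevance, mirroring the logic added above."""
    # DCG: sum the discounted gains at the 1-based ranks where gold documents were retrieved.
    dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants]) \
        if len(rank_retrieved_relevants) > 0 else 0.0
    # Ideal DCG: all gold documents ranked at positions 1..num_relevants.
    # Defaults to 1.0 when there are no gold documents, so ndcg becomes 0.0 instead of dividing by zero.
    idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)]) \
        if num_relevants > 0 else 1.0
    return dcg / idcg

# Example: one of two gold documents retrieved, at rank 1.
print(binary_ndcg(np.array([1]), num_relevants=2))  # ~0.613
```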
@@ -37,6 +37,7 @@ def test_generativeqa_calculate_metrics(document_store_with_docs: InMemoryDocume
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
     assert metrics["Generator"]["exact_match"] == 0.0
     assert metrics["Generator"]["f1"] == 1.0/3
 
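The expected value of 0.5 is consistent with the two-query fixture these tests appear to use: one query retrieves its gold document at rank 1 (nDCG = 1.0) and the other retrieves none of its gold documents (nDCG = 0.0). That per-query breakdown is inferred from the surrounding recall and MRR values rather than stated in the diff; a quick check with the sketch above:

```python
# Inferred two-query reading: gold document at rank 1 for one query, none retrieved for the other.
per_query = [binary_ndcg(np.array([1]), num_relevants=1), binary_ndcg(np.array([]), num_relevants=1)]
print(sum(per_query) / len(per_query))  # 0.5
```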
@@ -62,11 +63,13 @@ def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDoc
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
     assert metrics["Summarizer"]["mrr"] == 0.5
     assert metrics["Summarizer"]["map"] == 0.5
     assert metrics["Summarizer"]["recall_multi_hit"] == 0.5
     assert metrics["Summarizer"]["recall_single_hit"] == 0.5
     assert metrics["Summarizer"]["precision"] == 1.0/6
+    assert metrics["Summarizer"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
@@ -286,6 +289,7 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0/3
     assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0
 
     eval_result.save(tmp_path)
     saved_eval_result = EvaluationResult.load(tmp_path)
@@ -300,6 +304,7 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0/3
     assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -333,6 +338,7 @@ def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_pa
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
 
     eval_result.save(tmp_path)
     saved_eval_result = EvaluationResult.load(tmp_path)
@@ -349,6 +355,7 @@ def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_pa
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -370,6 +377,7 @@ def test_extractive_qa_eval_sas(reader, retriever_with_docs):
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
     assert "sas" in metrics["Reader"]
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
 
@@ -390,6 +398,7 @@ def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):
     assert metrics["Retriever"]["recall_multi_hit"] == 0.75
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0/3
+    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
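The approximate value 0.8066 can be reproduced under one plausible reading of this fixture (an assumption, not stated in the diff): two queries, one with a single gold document found at rank 1 (nDCG = 1.0), and one with two gold documents of which only the first-ranked one is found (nDCG ≈ 0.6131). That reading also matches the recall_multi_hit average of 0.75.

```python
import numpy as np

ndcg_q1 = 1.0                             # single gold document, retrieved at rank 1
ndcg_q2 = 1.0 / (1.0 + 1.0 / np.log2(3))  # two gold documents, one retrieved at rank 1
print((ndcg_q1 + ndcg_q2) / 2)            # ~0.8066
```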
@@ -412,6 +421,7 @@ def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
     assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_1["Retriever"]["precision"] == 1.0/6
+    assert metrics_top_1["Retriever"]["ndcg"] == 0.5
 
     metrics_top_2 = eval_result.calculate_metrics(simulated_top_k_reader=2)
 
@@ -423,6 +433,7 @@ def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
     assert metrics_top_2["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_2["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_2["Retriever"]["precision"] == 1.0/6
+    assert metrics_top_2["Retriever"]["ndcg"] == 0.5
 
     metrics_top_3 = eval_result.calculate_metrics(simulated_top_k_reader=3)
 
@@ -434,6 +445,7 @@ def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
     assert metrics_top_3["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_3["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_3["Retriever"]["precision"] == 1.0/6
+    assert metrics_top_3["Retriever"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -454,6 +466,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc
     assert metrics_top_10["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_10["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_10["Retriever"]["precision"] == 1.0/6
+    assert metrics_top_10["Retriever"]["ndcg"] == 0.5
 
     metrics_top_1 = eval_result.calculate_metrics(simulated_top_k_retriever=1)
 
@@ -464,6 +477,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc
     assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_1["Retriever"]["precision"] == 0.5
+    assert metrics_top_1["Retriever"]["ndcg"] == 0.5
 
     metrics_top_2 = eval_result.calculate_metrics(simulated_top_k_retriever=2)
 
@@ -474,6 +488,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc
     assert metrics_top_2["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_2["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_2["Retriever"]["precision"] == 0.25
+    assert metrics_top_2["Retriever"]["ndcg"] == 0.5
 
     metrics_top_3 = eval_result.calculate_metrics(simulated_top_k_retriever=3)
 
@@ -484,6 +499,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc
     assert metrics_top_3["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_3["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_3["Retriever"]["precision"] == 1.0/6
+    assert metrics_top_3["Retriever"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
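A side note on the simulated top-k hunks above: precision moves with the cutoff (1.0/6, 0.5, 0.25) while nDCG stays at 0.5. Under the same inferred two-query reading, the only retrieved gold document already sits at rank 1, so truncating the candidate list changes neither the DCG nor the ideal DCG.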
@@ -504,6 +520,7 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev
     assert metrics_top_10["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_10["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_10["Retriever"]["precision"] == 1.0/6
+    assert metrics_top_10["Retriever"]["ndcg"] == 0.5
 
     metrics_top_1 = eval_result.calculate_metrics(simulated_top_k_reader=1, simulated_top_k_retriever=1)
 
@@ -514,6 +531,7 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev
     assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_1["Retriever"]["precision"] == 0.5
+    assert metrics_top_1["Retriever"]["ndcg"] == 0.5
 
     metrics_top_2 = eval_result.calculate_metrics(simulated_top_k_reader=1, simulated_top_k_retriever=2)
 
@@ -524,6 +542,7 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev
     assert metrics_top_2["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_2["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_2["Retriever"]["precision"] == 0.25
+    assert metrics_top_2["Retriever"]["ndcg"] == 0.5
 
     metrics_top_3 = eval_result.calculate_metrics(simulated_top_k_reader=1, simulated_top_k_retriever=3)
 
@@ -534,7 +553,8 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev
     assert metrics_top_3["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_3["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_3["Retriever"]["precision"] == 1.0/6
+    assert metrics_top_3["Retriever"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@@ -556,6 +576,7 @@ def test_extractive_qa_eval_isolated(reader, retriever_with_docs):
     assert metrics_top_1["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics_top_1["Retriever"]["recall_single_hit"] == 0.5
     assert metrics_top_1["Retriever"]["precision"] == 1.0 / 6
+    assert metrics_top_1["Retriever"]["ndcg"] == 0.5
 
     metrics_top_1 = eval_result.calculate_metrics(simulated_top_k_reader=1, eval_mode="isolated")
 
@@ -643,6 +664,7 @@ def test_document_search_calculate_metrics(retriever_with_docs):
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -665,6 +687,7 @@ def test_faq_calculate_metrics(retriever_with_docs):
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
     assert metrics["Docs2Answers"]["exact_match"] == 0.0
     assert metrics["Docs2Answers"]["f1"] == 0.0
 
@@ -693,6 +716,7 @@ def test_extractive_qa_eval_translation(reader, retriever_with_docs, de_to_en_tr
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
 
     assert metrics["OutputTranslator"]["exact_match"] == 1.0
     assert metrics["OutputTranslator"]["f1"] == 1.0
@@ -701,6 +725,7 @@ def test_extractive_qa_eval_translation(reader, retriever_with_docs, de_to_en_tr
     assert metrics["OutputTranslator"]["recall_multi_hit"] == 0.5
     assert metrics["OutputTranslator"]["recall_single_hit"] == 0.5
     assert metrics["OutputTranslator"]["precision"] == 1.0/6
+    assert metrics["OutputTranslator"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -724,12 +749,14 @@ def test_question_generation_eval(retriever_with_docs, question_generator):
     assert metrics["Retriever"]["recall_multi_hit"] == 0.5
     assert metrics["Retriever"]["recall_single_hit"] == 0.5
     assert metrics["Retriever"]["precision"] == 1.0/6
+    assert metrics["Retriever"]["ndcg"] == 0.5
 
     assert metrics["Question Generator"]["mrr"] == 0.5
     assert metrics["Question Generator"]["map"] == 0.5
     assert metrics["Question Generator"]["recall_multi_hit"] == 0.5
     assert metrics["Question Generator"]["recall_single_hit"] == 0.5
     assert metrics["Question Generator"]["precision"] == 1.0/6
+    assert metrics["Question Generator"]["ndcg"] == 0.5
 
 
 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@@ -771,12 +798,14 @@ def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader):
     assert metrics["DPRRetriever"]["recall_multi_hit"] == 0.5
     assert metrics["DPRRetriever"]["recall_single_hit"] == 0.5
     assert metrics["DPRRetriever"]["precision"] == 1.0/6
+    assert metrics["DPRRetriever"]["ndcg"] == 0.5
 
     assert metrics["ESRetriever"]["mrr"] == 1.0
     assert metrics["ESRetriever"]["map"] == 1.0
     assert metrics["ESRetriever"]["recall_multi_hit"] == 1.0
     assert metrics["ESRetriever"]["recall_single_hit"] == 1.0
     assert metrics["ESRetriever"]["precision"] == 1.0/3
+    assert metrics["ESRetriever"]["ndcg"] == 1.0
 
     assert metrics["QAReader"]["exact_match"] == 1.0
     assert metrics["QAReader"]["f1"] == 1.0
@@ -819,12 +848,14 @@ def test_multi_retriever_pipeline_eval(document_store_with_docs, reader):
     assert metrics["DPRRetriever"]["recall_multi_hit"] == 0.5
     assert metrics["DPRRetriever"]["recall_single_hit"] == 0.5
     assert metrics["DPRRetriever"]["precision"] == 1.0/6
+    assert metrics["DPRRetriever"]["ndcg"] == 0.5
 
     assert metrics["ESRetriever"]["mrr"] == 1.0
     assert metrics["ESRetriever"]["map"] == 1.0
     assert metrics["ESRetriever"]["recall_multi_hit"] == 1.0
     assert metrics["ESRetriever"]["recall_single_hit"] == 1.0
     assert metrics["ESRetriever"]["precision"] == 1.0/3
+    assert metrics["ESRetriever"]["ndcg"] == 1.0
 
 
 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@@ -867,12 +898,14 @@ def test_multi_retriever_pipeline_with_asymmetric_qa_eval(document_store_with_do
     assert metrics["DPRRetriever"]["recall_multi_hit"] == 0.5
     assert metrics["DPRRetriever"]["recall_single_hit"] == 0.5
     assert metrics["DPRRetriever"]["precision"] == 1.0/6
+    assert metrics["DPRRetriever"]["ndcg"] == 0.5
 
     assert metrics["ESRetriever"]["mrr"] == 1.0
     assert metrics["ESRetriever"]["map"] == 1.0
     assert metrics["ESRetriever"]["recall_multi_hit"] == 1.0
     assert metrics["ESRetriever"]["recall_single_hit"] == 1.0
     assert metrics["ESRetriever"]["precision"] == 1.0/3
+    assert metrics["ESRetriever"]["ndcg"] == 1.0
 
     assert metrics["QAReader"]["exact_match"] == 1.0
     assert metrics["QAReader"]["f1"] == 1.0