fix: document retrieval metrics for non-document_id document_relevance_criteria (#3885)

* fix document retrieval metrics for all document_relevance_criteria
* fix tests
* fix eval_batch metrics
* small refactorings
* evaluate metrics on label level
* document retrieval tests added
* fix pylint
* fix test
* support file retrieval
* add comment about threshold
* rename test
parent e62d24d0eb
commit 9611b64ec5
@@ -1413,7 +1413,7 @@ class Pipeline:
     "multilabel_id", # generic
     "query", # generic
     "filters", # generic
-    "gold_answers", # answer-specific
+    "gold_answers", # generic
     "answer", # answer-specific
     "context", # generic
     "exact_match", # answer-specific
@@ -1690,6 +1690,7 @@ class Pipeline:
     df_docs.map_rows = partial(df_docs.apply, axis=1)
     df_docs.rename(columns={"id": "document_id", "content": "context"}, inplace=True)
     df_docs["gold_document_ids"] = [gold_document_ids] * len(df_docs)
+    df_docs["gold_answers"] = [gold_answers] * len(df_docs)
     df_docs["gold_contexts"] = [gold_contexts] * len(df_docs)
     df_docs["gold_contexts_similarity"] = df_docs.map_rows(
         lambda row: [
@@ -1740,7 +1741,12 @@ class Pipeline:

     # document_relevance_criterion: "document_id_and_answer",
     df_docs["gold_id_and_answer_match"] = df_docs.map_rows(
-        lambda row: min(row["gold_id_match"], row["answer_match"])
+        lambda row: max(
+            min(id_match, answer_match)
+            for id_match, answer_match in zip(
+                row["gold_documents_id_match"] + [0.0], row["gold_answers_match"] + [0.0]
+            )
+        )
     )

     # document_relevance_criterion: "context",
@@ -1757,17 +1763,36 @@ class Pipeline:

     # document_relevance_criterion: "document_id_and_context",
     df_docs["gold_id_and_context_match"] = df_docs.map_rows(
-        lambda row: min(row["gold_id_match"], row["context_match"])
+        lambda row: max(
+            min(id_match, 1.0 if context_similarity > context_matching_threshold else 0.0)
+            for id_match, context_similarity in zip(
+                row["gold_documents_id_match"] + [0.0], row["gold_contexts_similarity"] + [0.0]
+            )
+        )
     )

     # document_relevance_criterion: "document_id_and_context_and_answer",
     df_docs["gold_id_and_context_and_answer_match"] = df_docs.map_rows(
-        lambda row: min(row["gold_id_match"], row["context_match"], row["answer_match"])
+        lambda row: max(
+            min(id_match, 1.0 if context_similarity > context_matching_threshold else 0.0, answer_match)
+            for id_match, context_similarity, answer_match in zip(
+                row["gold_documents_id_match"] + [0.0],
+                row["gold_contexts_similarity"] + [0.0],
+                row["gold_answers_match"] + [0.0],
+            )
+        )
     )

     # document_relevance_criterion: "context_and_answer",
     df_docs["context_and_answer_match"] = df_docs.map_rows(
-        lambda row: min(row["context_match"], row["answer_match"])
+        lambda row: max(
+            min(1.0 if context_similarity > context_matching_threshold else 0.0, answer_match)
+            for context_similarity, answer_match in zip(
+                row["gold_contexts_similarity"], row["gold_answers_match"]
+            )
+        )
+        if any(row["gold_answers_match"]) and any(row["gold_contexts_similarity"])
+        else 0.0
     )

     df_docs["rank"] = np.arange(1, len(df_docs) + 1)
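All of the hunks above apply the same fix: a retrieved document now counts as relevant only if a single gold label satisfies every criterion, instead of combining per-criterion aggregates that may come from different labels. A minimal sketch of the "document_id_and_answer" case; the helper and the toy row are illustrative only, not part of the diff, and the old behaviour is approximated here by aggregating each criterion over all labels first:

    def id_and_answer_match(gold_documents_id_match, gold_answers_match):
        # New rule: at least one gold label must match on both document_id and answer.
        # The appended 0.0 keeps max() well-defined when there are no gold labels, as in the diff.
        return max(
            min(id_match, answer_match)
            for id_match, answer_match in zip(gold_documents_id_match + [0.0], gold_answers_match + [0.0])
        )

    # Two gold labels: label 0 matches only by document id, label 1 only by answer.
    row = {"gold_documents_id_match": [1.0, 0.0], "gold_answers_match": [0.0, 1.0]}

    old_style = min(max(row["gold_documents_id_match"]), max(row["gold_answers_match"]))  # 1.0 - false positive
    new_style = id_and_answer_match(row["gold_documents_id_match"], row["gold_answers_match"])  # 0.0
    assert (old_style, new_style) == (1.0, 0.0)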
@@ -1436,35 +1436,102 @@ class EvaluationResult:
         if simulated_top_k_retriever != -1:
             documents = documents[documents["rank"] <= simulated_top_k_retriever]

+        # find out which label matched
+        def find_matched_label_idxs(row) -> List[int]: # pylint: disable=too-many-return-statements
+            id_matches = [idx for idx, val in enumerate(row["gold_documents_id_match"]) if val == 1.0]
+            context_matches = [
+                idx for idx, val in enumerate(row["gold_contexts_similarity"]) if val > 65.0
+            ] # TODO: hardcoded threshold for now, will be param of calculate_metrics
+            answer_matches = [idx for idx, val in enumerate(row["gold_answers_match"]) if val == 1.0]
+            if document_relevance_criterion == "document_id":
+                return id_matches
+            elif document_relevance_criterion == "context":
+                return context_matches
+            elif document_relevance_criterion == "answer":
+                return answer_matches
+            elif document_relevance_criterion == "document_id_and_context":
+                return list(set(id_matches) & set(context_matches))
+            elif document_relevance_criterion == "document_id_or_context":
+                return list(set(id_matches) | set(context_matches))
+            elif document_relevance_criterion == "document_id_and_answer":
+                return list(set(id_matches) & set(answer_matches))
+            elif document_relevance_criterion == "document_id_or_answer":
+                return list(set(id_matches) | set(answer_matches))
+            elif document_relevance_criterion == "context_and_answer":
+                return list(set(context_matches) & set(answer_matches))
+            elif document_relevance_criterion == "document_id_and_context_and_answer":
+                return list(set(id_matches) & set(context_matches) & set(answer_matches))
+            else:
+                raise ValueError(f"document_relevance_criterion '{document_relevance_criterion}' not supported.")
+
+        documents["matched_label_idxs"] = documents.apply(find_matched_label_idxs, axis=1)
+
         metrics = []

         for multilabel_id in documents["multilabel_id"].unique():
             query_df = documents[documents["multilabel_id"] == multilabel_id]
-            gold_ids = list(query_df["gold_document_ids"].iloc[0])
-            retrieved = len(query_df)

+            # Note: Metrics are always calculated on document_ids.
+            # For some document relevance criteria (e.g. context), the gold_document_ids are not enough or not useful at all.
+            # So, we have to adjust the relevant ids according to the document_relevance_criterion.
             relevance_criterion_col = f"{document_relevance_criterion.replace('document_id', 'gold_id')}_match"
-            relevance_criterion_ids = list(query_df[query_df[relevance_criterion_col] == 1]["document_id"].values)
-            num_relevants = len(set(gold_ids + relevance_criterion_ids))
-            num_retrieved_relevants = query_df[relevance_criterion_col].values.sum()
-            rank_retrieved_relevants = query_df[query_df[relevance_criterion_col] == 1]["rank"].values
-            avp_retrieved_relevants = [
-                query_df[relevance_criterion_col].values[: int(rank)].sum() / rank for rank in rank_retrieved_relevants
-            ]
+            relevant_rows = query_df[query_df[relevance_criterion_col] == 1]

-            avg_precision = np.sum(avp_retrieved_relevants) / num_relevants if num_relevants > 0 else 0.0
-            recall_multi_hit = num_retrieved_relevants / num_relevants if num_relevants > 0 else 1.0
-            recall_single_hit = min(num_retrieved_relevants, 1) if num_relevants > 0 else 1.0
-            precision = num_retrieved_relevants / retrieved if retrieved > 0 else 0.0
-            rr = 1.0 / rank_retrieved_relevants.min() if len(rank_retrieved_relevants) > 0 else 0.0
-            dcg = (
-                np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
-                if len(rank_retrieved_relevants) > 0
-                else 0.0
-            )
-            idcg = (
-                np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)]) if num_relevants > 0 else 1.0
+            # all labels without no_answers
+            # we need to match all (except for single hit recall)
+            gold_document_ids = (
+                list(query_df["gold_custom_document_ids"].iloc[0])
+                if "gold_custom_document_ids" in query_df
+                else list(query_df["gold_document_ids"].iloc[0])
             )
+            # remove no_answer label
+            gold_document_ids = [id for id in gold_document_ids if id != "00"]
+
+            num_labels = len(gold_document_ids)
+            num_matched_labels = len(set(idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs))
+            num_missing_labels = num_labels - num_matched_labels
+
+            relevance_criterion_ids = list(relevant_rows["document_id"].values)
+            num_relevants = len(set(relevance_criterion_ids)) + num_missing_labels
+
+            num_retrieved = len(query_df["document_id"])
+            num_retrieved_relevants = len(relevant_rows)
+            rank_retrieved_relevants = relevant_rows["rank"].values
+
+            if num_labels == 0:
+                # For no_answer queries, we set all metrics to 1.0, to indicate that the retriever cannot improve the pipeline.
+                # This behavior is different from pytrec_eval, which sets the metrics to 0.0 if there is no relevant document in the evalset.
+                rr = 1.0
+                avg_precision = 1.0
+                recall_multi_hit = 1.0
+                recall_single_hit = 1.0
+                precision = 1.0
+                ndcg = 1.0
+            elif num_retrieved_relevants == 0:
+                # Set all metrics to 0.0 if no relevant document has been retrieved to avoid undefined metrics.
+                rr = 0.0
+                avg_precision = 0.0
+                recall_multi_hit = 0.0
+                recall_single_hit = 0.0
+                precision = 0.0
+                ndcg = 0.0
+            else:
+                # The previous checks ensure:
+                # - `num_labels` > 0
+                # - `num_retrieved_relevants` > 0
+                # - `num_relevants` > 0 (`num_relevants` is always >= `num_labels`)
+                # - `num_retrieved` > 0 (`num_retrieved` is always >= `num_retrieved_relevants`)
+                # - `len(rank_retrieved_relevants)` > 0 (`len(rank_retrieved_relevants)` is always == `num_retrieved_relevants`)
+                avp_retrieved_relevants = [
+                    len(relevant_rows[relevant_rows["rank"] <= rank]) / rank for rank in rank_retrieved_relevants
+                ]
+                avg_precision = np.sum(avp_retrieved_relevants) / num_relevants
+                recall_multi_hit = num_matched_labels / num_labels
+                recall_single_hit = 1.0
+                precision = num_retrieved_relevants / num_retrieved
+                rr = 1.0 / rank_retrieved_relevants.min()
+                dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
+                idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)])
-            ndcg = dcg / idcg
+                ndcg = dcg / idcg

             metrics.append(
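The rewritten loop above evaluates retrieval on the label level: find_matched_label_idxs records which gold labels each retrieved document satisfies, and recall is then measured against labels rather than against raw gold document ids. A toy walk-through with hypothetical values (not taken from the test suite), mirroring the variables introduced in the hunk:

    gold_document_ids = ["doc_a", "doc_b"]        # two gold labels, no_answer labels already dropped
    matched_label_idxs = [{0}, {0}, {0}]          # label indices matched by each retrieved *relevant* row
    relevant_document_ids = ["d1", "d7", "d1"]    # document_id of each retrieved relevant row
    ranks = [1, 3, 4]                             # ranks of those rows
    num_retrieved = 5                             # e.g. top_k = 5 documents were retrieved

    num_labels = len(gold_document_ids)                                    # 2
    num_matched_labels = len(set().union(*matched_label_idxs))             # only label 0 was found -> 1
    num_missing_labels = num_labels - num_matched_labels                   # 1
    num_relevants = len(set(relevant_document_ids)) + num_missing_labels   # 2 unique relevant docs + 1 missed label = 3

    recall_multi_hit = num_matched_labels / num_labels                     # 0.5: half of the labels were retrieved
    precision = len(relevant_document_ids) / num_retrieved                 # 0.6: 3 of the 5 retrieved documents are relevant
    rr = 1.0 / min(ranks)                                                  # 1.0: first relevant document sits at rank 1

Because num_missing_labels is added to num_relevants, a label that no retrieved document matches still lowers mAP and nDCG, not just recall.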
@@ -69,17 +69,17 @@ def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDoc
     assert len(eval_result) == 2

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Summarizer"]["mrr"] == 1.0
-    assert metrics["Summarizer"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Summarizer"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Summarizer"]["map"] == 1.0
+    assert metrics["Summarizer"]["recall_multi_hit"] == 1.0
     assert metrics["Summarizer"]["recall_single_hit"] == 1.0
     assert metrics["Summarizer"]["precision"] == 1.0
-    assert metrics["Summarizer"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Summarizer"]["ndcg"] == 1.0


 @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
@@ -304,6 +304,108 @@ EVAL_LABELS = [
     ),
 ]

+NO_ANSWER_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Why does probability work?",
+                document=Document(""),
+                answer=None,
+                is_correct_answer=True,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    )
+]
+
+DOC_SEARCH_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Berlin?",
+                answer=None,
+                document=Document(
+                    id="a0747b83aea0b60c4b114b15476dd32d",
+                    content_type="text",
+                    content="My name is Carla and I live in Berlin",
+                ),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Munich?",
+                answer=None,
+                document=Document(
+                    id="something_else", content_type="text", content="My name is Carla and I live in Munich"
+                ),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+]
+
+DOC_SEARCH_ID_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Berlin?",
+                answer=None,
+                document=Document(id="a0747b83aea0b60c4b114b15476dd32d", content_type="text", content=""),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Munich?",
+                answer=None,
+                document=Document(id="something_else", content_type="text", content=""),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+]
+
+FILE_SEARCH_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Berlin?",
+                answer=None,
+                document=Document(content_type="text", content="", meta={"name": "filename1"}),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Munich?",
+                answer=None,
+                document=Document(content_type="text", content="", meta={"name": "filename2"}),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+]
+

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
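These fixtures exercise the non-document_id relevance criteria in isolation: DOC_SEARCH_EVAL_LABELS carries real contexts, DOC_SEARCH_ID_EVAL_LABELS only document ids, FILE_SEARCH_EVAL_LABELS neither (relevance can only come from the custom id stored in meta["name"]), and NO_ANSWER_EVAL_LABELS has no gold document at all. The sketch below simply mirrors how the tests added further down consume them, shown here for orientation only:

    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
    eval_result = pipeline.eval(
        labels=FILE_SEARCH_EVAL_LABELS,
        params={"Retriever": {"top_k": 5}},
        context_matching_min_length=20,
        custom_document_id_field="name",  # match on meta["name"] instead of the document id
    )
    metrics = eval_result.calculate_metrics(document_scope="document_id")  # driven by the custom id
    metrics_ctx = eval_result.calculate_metrics(document_scope="context")  # empty contexts, so nothing matches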
@@ -320,7 +422,6 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
     retriever_result = eval_result["Retriever"]

     expected_reader_result_columns = [
-        "gold_answers", # answer-specific
         "answer", # answer-specific
         "exact_match", # answer-specific
         "f1", # answer-specific
@@ -370,6 +471,7 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
         "rank", # generic
         "document_id", # generic
         "gold_document_ids", # generic
+        "gold_answers", # generic
         # "custom_document_id", # generic optional
         # "gold_custom_document_ids", # generic optional
     ]
@@ -622,9 +724,8 @@ def test_reader_eval_in_pipeline(reader):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
-    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
+def test_extractive_qa_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
         labels=EVAL_LABELS,
         params={"Retriever": {"top_k": 5}},
@@ -643,11 +744,11 @@ def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(document_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")

@@ -661,29 +762,249 @@ def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0


+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+def test_document_search_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=DOC_SEARCH_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20, # artificially set down min_length to see if context matching is working properly
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="context")
+
+    assert metrics["Retriever"]["mrr"] == 1.0
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")
+
+    assert metrics["Retriever"]["mrr"] == 1.0
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0
+
+    metrics = eval_result.calculate_metrics(document_scope="answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+
+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+def test_document_search_id_only_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=DOC_SEARCH_ID_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20, # artificially set down min_length to see if context matching is working properly
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+
+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+def test_file_search_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=FILE_SEARCH_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20, # artificially set down min_length to see if context matching is working properly
+        custom_document_id_field="name",
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id")
+
+    assert metrics["Retriever"]["mrr"] == 0.6
+    assert metrics["Retriever"]["map"] == 0.6
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 0.2
+    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.6934, 0.0001)
+
+    metrics = eval_result.calculate_metrics(document_scope="context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.6
+    assert metrics["Retriever"]["map"] == 0.6
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 0.2
+    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.6934, 0.0001)
+
+    metrics = eval_result.calculate_metrics(document_scope="answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.6
+    assert metrics["Retriever"]["map"] == 0.6
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 0.2
+    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.6934, 0.0001)
+
+
+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize(
+    "document_scope",
+    ["document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"],
+)
+def test_extractive_qa_eval_document_scope_no_answer(retriever_with_docs, document_scope):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=NO_ANSWER_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20, # artificially set down min_length to see if context matching is working properly
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope=document_scope)
+
+    assert metrics["Retriever"]["mrr"] == 1.0
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0
+
+
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -701,11 +1022,11 @@ def test_extractive_qa_eval_answer_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(answer_scope="any")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
@@ -713,11 +1034,11 @@ def test_extractive_qa_eval_answer_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(answer_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
@@ -69,17 +69,17 @@ def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDoc
     assert len(eval_result) == 2

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Summarizer"]["mrr"] == 1.0
-    assert metrics["Summarizer"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Summarizer"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Summarizer"]["map"] == 1.0
+    assert metrics["Summarizer"]["recall_multi_hit"] == 1.0
     assert metrics["Summarizer"]["recall_single_hit"] == 1.0
     assert metrics["Summarizer"]["precision"] == 1.0
-    assert metrics["Summarizer"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Summarizer"]["ndcg"] == 1.0


 EVAL_LABELS = [
@@ -285,9 +285,8 @@ def test_reader_eval_in_pipeline(reader):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
-    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
+def test_extractive_qa_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval_batch(
         labels=EVAL_LABELS,
         params={"Retriever": {"top_k": 5}},
@@ -306,11 +305,11 @@ def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(document_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")

@@ -324,29 +323,29 @@ def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0


 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -364,11 +363,11 @@ def test_extractive_qa_eval_answer_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(answer_scope="any")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
@@ -376,11 +375,11 @@ def test_extractive_qa_eval_answer_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(answer_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)