diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py
index 6945663ce..e6065ae8c 100644
--- a/haystack/pipelines/base.py
+++ b/haystack/pipelines/base.py
@@ -1413,7 +1413,7 @@ class Pipeline:
             "multilabel_id",  # generic
             "query",  # generic
             "filters",  # generic
-            "gold_answers",  # answer-specific
+            "gold_answers",  # generic
             "answer",  # answer-specific
             "context",  # generic
             "exact_match",  # answer-specific
@@ -1690,6 +1690,7 @@ class Pipeline:
         df_docs.map_rows = partial(df_docs.apply, axis=1)
         df_docs.rename(columns={"id": "document_id", "content": "context"}, inplace=True)
         df_docs["gold_document_ids"] = [gold_document_ids] * len(df_docs)
+        df_docs["gold_answers"] = [gold_answers] * len(df_docs)
         df_docs["gold_contexts"] = [gold_contexts] * len(df_docs)
         df_docs["gold_contexts_similarity"] = df_docs.map_rows(
             lambda row: [
@@ -1740,7 +1741,12 @@ class Pipeline:

         # document_relevance_criterion: "document_id_and_answer",
         df_docs["gold_id_and_answer_match"] = df_docs.map_rows(
-            lambda row: min(row["gold_id_match"], row["answer_match"])
+            lambda row: max(
+                min(id_match, answer_match)
+                for id_match, answer_match in zip(
+                    row["gold_documents_id_match"] + [0.0], row["gold_answers_match"] + [0.0]
+                )
+            )
         )

         # document_relevance_criterion: "context",
@@ -1757,17 +1763,36 @@ class Pipeline:

         # document_relevance_criterion: "document_id_and_context",
         df_docs["gold_id_and_context_match"] = df_docs.map_rows(
-            lambda row: min(row["gold_id_match"], row["context_match"])
+            lambda row: max(
+                min(id_match, 1.0 if context_similarity > context_matching_threshold else 0.0)
+                for id_match, context_similarity in zip(
+                    row["gold_documents_id_match"] + [0.0], row["gold_contexts_similarity"] + [0.0]
+                )
+            )
         )

         # document_relevance_criterion: "document_id_and_context_and_answer",
         df_docs["gold_id_and_context_and_answer_match"] = df_docs.map_rows(
-            lambda row: min(row["gold_id_match"], row["context_match"], row["answer_match"])
+            lambda row: max(
+                min(id_match, 1.0 if context_similarity > context_matching_threshold else 0.0, answer_match)
+                for id_match, context_similarity, answer_match in zip(
+                    row["gold_documents_id_match"] + [0.0],
+                    row["gold_contexts_similarity"] + [0.0],
+                    row["gold_answers_match"] + [0.0],
+                )
+            )
         )

         # document_relevance_criterion: "context_and_answer",
         df_docs["context_and_answer_match"] = df_docs.map_rows(
-            lambda row: min(row["context_match"], row["answer_match"])
+            lambda row: max(
+                min(1.0 if context_similarity > context_matching_threshold else 0.0, answer_match)
+                for context_similarity, answer_match in zip(
+                    row["gold_contexts_similarity"], row["gold_answers_match"]
+                )
+            )
+            if any(row["gold_answers_match"]) and any(row["gold_contexts_similarity"])
+            else 0.0
         )

         df_docs["rank"] = np.arange(1, len(df_docs) + 1)
diff --git a/haystack/schema.py b/haystack/schema.py
index 897bd16fe..c808b180e 100644
--- a/haystack/schema.py
+++ b/haystack/schema.py
@@ -1436,36 +1436,103 @@ class EvaluationResult:
         if simulated_top_k_retriever != -1:
             documents = documents[documents["rank"] <= simulated_top_k_retriever]

+        # find out which label matched
+        def find_matched_label_idxs(row) -> List[int]:  # pylint: disable=too-many-return-statements
+            id_matches = [idx for idx, val in enumerate(row["gold_documents_id_match"]) if val == 1.0]
+            context_matches = [
+                idx for idx, val in enumerate(row["gold_contexts_similarity"]) if val > 65.0
+            ]  # TODO: hardcoded threshold for now, will be param of calculate_metrics
+            answer_matches = [idx for idx, val in enumerate(row["gold_answers_match"]) if val == 1.0]
+            if document_relevance_criterion == "document_id":
+                return id_matches
+            elif document_relevance_criterion == "context":
+                return context_matches
+            elif document_relevance_criterion == "answer":
+                return answer_matches
+            elif document_relevance_criterion == "document_id_and_context":
+                return list(set(id_matches) & set(context_matches))
+            elif document_relevance_criterion == "document_id_or_context":
+                return list(set(id_matches) | set(context_matches))
+            elif document_relevance_criterion == "document_id_and_answer":
+                return list(set(id_matches) & set(answer_matches))
+            elif document_relevance_criterion == "document_id_or_answer":
+                return list(set(id_matches) | set(answer_matches))
+            elif document_relevance_criterion == "context_and_answer":
+                return list(set(context_matches) & set(answer_matches))
+            elif document_relevance_criterion == "document_id_and_context_and_answer":
+                return list(set(id_matches) & set(context_matches) & set(answer_matches))
+            else:
+                raise ValueError(f"document_relevance_criterion '{document_relevance_criterion}' not supported.")
+
+        documents["matched_label_idxs"] = documents.apply(find_matched_label_idxs, axis=1)
+
         metrics = []

         for multilabel_id in documents["multilabel_id"].unique():
             query_df = documents[documents["multilabel_id"] == multilabel_id]
-            gold_ids = list(query_df["gold_document_ids"].iloc[0])
-            retrieved = len(query_df)

+            # Note: Metrics are always calculated on document_ids.
+            # For some document relevance criteria (e.g. context), the gold_document_ids are not enough or not useful at all.
+            # So, we have to adjust the relevant ids according to the document_relevance_criterion.
             relevance_criterion_col = f"{document_relevance_criterion.replace('document_id', 'gold_id')}_match"
-            relevance_criterion_ids = list(query_df[query_df[relevance_criterion_col] == 1]["document_id"].values)
-            num_relevants = len(set(gold_ids + relevance_criterion_ids))
-            num_retrieved_relevants = query_df[relevance_criterion_col].values.sum()
-            rank_retrieved_relevants = query_df[query_df[relevance_criterion_col] == 1]["rank"].values
-            avp_retrieved_relevants = [
-                query_df[relevance_criterion_col].values[: int(rank)].sum() / rank for rank in rank_retrieved_relevants
-            ]
+            relevant_rows = query_df[query_df[relevance_criterion_col] == 1]

-            avg_precision = np.sum(avp_retrieved_relevants) / num_relevants if num_relevants > 0 else 0.0
-            recall_multi_hit = num_retrieved_relevants / num_relevants if num_relevants > 0 else 1.0
-            recall_single_hit = min(num_retrieved_relevants, 1) if num_relevants > 0 else 1.0
-            precision = num_retrieved_relevants / retrieved if retrieved > 0 else 0.0
-            rr = 1.0 / rank_retrieved_relevants.min() if len(rank_retrieved_relevants) > 0 else 0.0
-            dcg = (
-                np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
-                if len(rank_retrieved_relevants) > 0
-                else 0.0
+            # all labels without no_answers
+            # we need to match all (except for single hit recall)
+            gold_document_ids = (
+                list(query_df["gold_custom_document_ids"].iloc[0])
+                if "gold_custom_document_ids" in query_df
+                else list(query_df["gold_document_ids"].iloc[0])
             )
-            idcg = (
-                np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)]) if num_relevants > 0 else 1.0
-            )
-            ndcg = dcg / idcg
+            # remove no_answer label
+            gold_document_ids = [id for id in gold_document_ids if id != "00"]
+
+            num_labels = len(gold_document_ids)
+            num_matched_labels = len(set(idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs))
+            num_missing_labels = num_labels - num_matched_labels
+
+            relevance_criterion_ids = list(relevant_rows["document_id"].values)
+            num_relevants = len(set(relevance_criterion_ids)) + num_missing_labels
+
+            num_retrieved = len(query_df["document_id"])
+            num_retrieved_relevants = len(relevant_rows)
+            rank_retrieved_relevants = relevant_rows["rank"].values
+
+            if num_labels == 0:
+                # For no_answer queries, we set all metrics to 1.0, to indicate that the retriever cannot improve the pipeline.
+                # This behavior is different from pytrec_eval, which sets the metrics to 0.0 if there is no relevant document in the evalset.
+                rr = 1.0
+                avg_precision = 1.0
+                recall_multi_hit = 1.0
+                recall_single_hit = 1.0
+                precision = 1.0
+                ndcg = 1.0
+            elif num_retrieved_relevants == 0:
+                # Set all metrics to 0.0 if no relevant document has been retrieved to avoid undefined metrics.
+                rr = 0.0
+                avg_precision = 0.0
+                recall_multi_hit = 0.0
+                recall_single_hit = 0.0
+                precision = 0.0
+                ndcg = 0.0
+            else:
+                # The previous checks ensure:
+                # - `num_labels` > 0
+                # - `num_retrieved_relevants` > 0
+                # - `num_relevants` > 0 (`num_relevants` is always >= `num_labels`)
+                # - `num_retrieved` > 0 (`num_retrieved` is always >= `num_retrieved_relevants`)
+                # - `len(rank_retrieved_relevants)` > 0 (`len(rank_retrieved_relevants)` is always == `num_retrieved_relevants`)
+                avp_retrieved_relevants = [
+                    len(relevant_rows[relevant_rows["rank"] <= rank]) / rank for rank in rank_retrieved_relevants
+                ]
+                avg_precision = np.sum(avp_retrieved_relevants) / num_relevants
+                recall_multi_hit = num_matched_labels / num_labels
+                recall_single_hit = 1.0
+                precision = num_retrieved_relevants / num_retrieved
+                rr = 1.0 / rank_retrieved_relevants.min()
+                dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
+                idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)])
+                ndcg = dcg / idcg

             metrics.append(
                 {
diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py
index b69b04e64..add0dbc50 100644
--- a/test/pipelines/test_eval.py
+++ b/test/pipelines/test_eval.py
@@ -69,17 +69,17 @@ def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDoc
     assert len(eval_result) == 2

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Summarizer"]["mrr"] == 1.0
-    assert metrics["Summarizer"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Summarizer"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Summarizer"]["map"] == 1.0
+    assert metrics["Summarizer"]["recall_multi_hit"] == 1.0
     assert metrics["Summarizer"]["recall_single_hit"] == 1.0
     assert metrics["Summarizer"]["precision"] == 1.0
-    assert metrics["Summarizer"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Summarizer"]["ndcg"] == 1.0


 @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
@@ -304,6 +304,108 @@ EVAL_LABELS = [
     ),
 ]

+NO_ANSWER_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Why does probability work?",
+                document=Document(""),
+                answer=None,
+                is_correct_answer=True,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    )
+]
+
+DOC_SEARCH_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Berlin?",
+                answer=None,
+                document=Document(
+                    id="a0747b83aea0b60c4b114b15476dd32d",
+                    content_type="text",
+                    content="My name is Carla and I live in Berlin",
+                ),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Munich?",
+                answer=None,
+                document=Document(
+                    id="something_else", content_type="text", content="My name is Carla and I live in Munich"
+                ),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+]
+
+DOC_SEARCH_ID_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Berlin?",
+                answer=None,
+                document=Document(id="a0747b83aea0b60c4b114b15476dd32d", content_type="text", content=""),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Munich?",
+                answer=None,
+                document=Document(id="something_else", content_type="text", content=""),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+]
+
+FILE_SEARCH_EVAL_LABELS = [
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Berlin?",
+                answer=None,
+                document=Document(content_type="text", content="", meta={"name": "filename1"}),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+    MultiLabel(
+        labels=[
+            Label(
+                query="Who lives in Munich?",
+                answer=None,
+                document=Document(content_type="text", content="", meta={"name": "filename2"}),
+                is_correct_answer=False,
+                is_correct_document=True,
+                origin="gold-label",
+            )
+        ]
+    ),
+]
+

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@@ -320,7 +422,6 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
     retriever_result = eval_result["Retriever"]

     expected_reader_result_columns = [
-        "gold_answers",  # answer-specific
         "answer",  # answer-specific
         "exact_match",  # answer-specific
         "f1",  # answer-specific
@@ -370,6 +471,7 @@
         "rank",  # generic
         "document_id",  # generic
         "gold_document_ids",  # generic
+        "gold_answers",  # generic
         # "custom_document_id",  # generic optional
         # "gold_custom_document_ids",  # generic optional
     ]
@@ -622,9 +724,8 @@ def test_reader_eval_in_pipeline(reader):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
-    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
+def test_extractive_qa_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
         labels=EVAL_LABELS,
         params={"Retriever": {"top_k": 5}},
@@ -643,11 +744,11 @@
     metrics = eval_result.calculate_metrics(document_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")

@@ -661,29 +762,249 @@
     metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
+
+
+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+def test_document_search_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=DOC_SEARCH_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20,  # artificially set down min_length to see if context matching is working properly
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="context")
+
+    assert metrics["Retriever"]["mrr"] == 1.0
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")
+
+    assert metrics["Retriever"]["mrr"] == 1.0
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0
+
+    metrics = eval_result.calculate_metrics(document_scope="answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+
+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+def test_document_search_id_only_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=DOC_SEARCH_ID_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20,  # artificially set down min_length to see if context matching is working properly
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+    metrics = eval_result.calculate_metrics(document_scope="answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.5
+    assert metrics["Retriever"]["map"] == 0.5
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.5
+    assert metrics["Retriever"]["recall_single_hit"] == 0.5
+    assert metrics["Retriever"]["precision"] == 0.1
+    assert metrics["Retriever"]["ndcg"] == 0.5
+
+
+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+def test_file_search_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=FILE_SEARCH_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20,  # artificially set down min_length to see if context matching is working properly
+        custom_document_id_field="name",
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id")
+
+    assert metrics["Retriever"]["mrr"] == 0.6
+    assert metrics["Retriever"]["map"] == 0.6
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 0.2
+    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.6934, 0.0001)
+
+    metrics = eval_result.calculate_metrics(document_scope="context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")
+
+    assert metrics["Retriever"]["mrr"] == 0.6
+    assert metrics["Retriever"]["map"] == 0.6
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 0.2
+    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.6934, 0.0001)
+
+    metrics = eval_result.calculate_metrics(document_scope="answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.0
+    assert metrics["Retriever"]["map"] == 0.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 0.0
+    assert metrics["Retriever"]["recall_single_hit"] == 0.0
+    assert metrics["Retriever"]["precision"] == 0.0
+    assert metrics["Retriever"]["ndcg"] == 0.0
+
+    metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
+
+    assert metrics["Retriever"]["mrr"] == 0.6
+    assert metrics["Retriever"]["map"] == 0.6
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 0.2
+    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.6934, 0.0001)
+
+
+@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize(
+    "document_scope",
+    ["document_id", "context", "document_id_and_context", "document_id_or_context", "answer", "document_id_or_answer"],
+)
+def test_extractive_qa_eval_document_scope_no_answer(retriever_with_docs, document_scope):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
+    eval_result: EvaluationResult = pipeline.eval(
+        labels=NO_ANSWER_EVAL_LABELS,
+        params={"Retriever": {"top_k": 5}},
+        context_matching_min_length=20,  # artificially set down min_length to see if context matching is working properly
+    )
+
+    metrics = eval_result.calculate_metrics(document_scope=document_scope)
+
+    assert metrics["Retriever"]["mrr"] == 1.0
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
+    assert metrics["Retriever"]["recall_single_hit"] == 1.0
+    assert metrics["Retriever"]["precision"] == 1.0
+    assert metrics["Retriever"]["ndcg"] == 1.0


 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -701,11 +1022,11 @@ def test_extractive_qa_eval_answer_scope(reader, retriever_with_docs):
     metrics = eval_result.calculate_metrics(answer_scope="any")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
@@ -713,11 +1034,11 @@
     metrics = eval_result.calculate_metrics(answer_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
diff --git a/test/pipelines/test_eval_batch.py b/test/pipelines/test_eval_batch.py
index 6b66aa543..fcdf4f6ba 100644
--- a/test/pipelines/test_eval_batch.py
+++ b/test/pipelines/test_eval_batch.py
@@ -69,17 +69,17 @@ def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDoc
     assert len(eval_result) == 2

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Summarizer"]["mrr"] == 1.0
-    assert metrics["Summarizer"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Summarizer"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Summarizer"]["map"] == 1.0
+    assert metrics["Summarizer"]["recall_multi_hit"] == 1.0
     assert metrics["Summarizer"]["recall_single_hit"] == 1.0
     assert metrics["Summarizer"]["precision"] == 1.0
-    assert metrics["Summarizer"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Summarizer"]["ndcg"] == 1.0


 EVAL_LABELS = [
@@ -285,9 +285,8 @@ def test_reader_eval_in_pipeline(reader):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_extractive_qa_eval_document_scope(reader, retriever_with_docs):
-    pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
+def test_extractive_qa_eval_document_scope(retriever_with_docs):
+    pipeline = DocumentSearchPipeline(retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval_batch(
         labels=EVAL_LABELS,
         params={"Retriever": {"top_k": 5}},
@@ -306,11 +305,11 @@
     metrics = eval_result.calculate_metrics(document_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_and_context")

@@ -324,29 +323,29 @@
     metrics = eval_result.calculate_metrics(document_scope="document_id_or_context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == pytest.approx(0.9167, 1e-4)
-    assert metrics["Retriever"]["recall_multi_hit"] == pytest.approx(0.9167, 1e-4)
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 1.0
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.9461, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0

     metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0


 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@@ -364,11 +363,11 @@
     metrics = eval_result.calculate_metrics(answer_scope="any")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
@@ -376,11 +375,11 @@
     metrics = eval_result.calculate_metrics(answer_scope="context")

     assert metrics["Retriever"]["mrr"] == 1.0
-    assert metrics["Retriever"]["map"] == 0.75
-    assert metrics["Retriever"]["recall_multi_hit"] == 0.75
+    assert metrics["Retriever"]["map"] == 1.0
+    assert metrics["Retriever"]["recall_multi_hit"] == 1.0
     assert metrics["Retriever"]["recall_single_hit"] == 1.0
     assert metrics["Retriever"]["precision"] == 0.2
-    assert metrics["Retriever"]["ndcg"] == pytest.approx(0.8066, 1e-4)
+    assert metrics["Retriever"]["ndcg"] == 1.0
     assert metrics["Reader"]["exact_match"] == 1.0
     assert metrics["Reader"]["f1"] == 1.0
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
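For reviewers, the per-query branching added to EvaluationResult in the haystack/schema.py hunk above can be summarized with a small standalone sketch. The helper below is illustrative only; its name, signature, and return keys are hypothetical and not part of Haystack's API. It assumes the same per-query inputs the patch derives from the evaluation dataframe: the number of gold labels (no_answer labels removed), how many of them were matched under the chosen document_relevance_criterion, the adjusted number of relevant documents, the number of retrieved documents, and the ranks of the retrieved relevant documents.

# Illustrative sketch only: mirrors the metric branches introduced in the schema.py hunk.
# per_query_metrics is a hypothetical helper, not a Haystack function.
from math import log2


def per_query_metrics(num_labels, num_matched_labels, num_relevants, num_retrieved, ranks_of_relevants):
    num_retrieved_relevants = len(ranks_of_relevants)
    if num_labels == 0:
        # no_answer query: the retriever cannot improve the pipeline, so all metrics are 1.0
        return {"mrr": 1.0, "map": 1.0, "recall_multi_hit": 1.0, "recall_single_hit": 1.0, "precision": 1.0, "ndcg": 1.0}
    if num_retrieved_relevants == 0:
        # no relevant document retrieved: all metrics are 0.0, avoiding undefined divisions
        return {"mrr": 0.0, "map": 0.0, "recall_multi_hit": 0.0, "recall_single_hit": 0.0, "precision": 0.0, "ndcg": 0.0}
    # precision at the rank of each retrieved relevant document
    avp = [sum(1 for r in ranks_of_relevants if r <= rank) / rank for rank in ranks_of_relevants]
    dcg = sum(1.0 / log2(r + 1) for r in ranks_of_relevants)
    idcg = sum(1.0 / log2(r + 1) for r in range(1, num_relevants + 1))
    return {
        "mrr": 1.0 / min(ranks_of_relevants),
        "map": sum(avp) / num_relevants,
        "recall_multi_hit": num_matched_labels / num_labels,
        "recall_single_hit": 1.0,
        "precision": num_retrieved_relevants / num_retrieved,
        "ndcg": dcg / idcg,
    }


# Example: one gold label, matched by the document retrieved at rank 2 out of 5
print(per_query_metrics(num_labels=1, num_matched_labels=1, num_relevants=1, num_retrieved=5, ranks_of_relevants=[2]))
# {'mrr': 0.5, 'map': 0.5, 'recall_multi_hit': 1.0, 'recall_single_hit': 1.0, 'precision': 0.2, 'ndcg': 0.6309...}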