diff --git a/haystack/nodes/reader/base.py b/haystack/nodes/reader/base.py index 454515191..0cc4a44f0 100644 --- a/haystack/nodes/reader/base.py +++ b/haystack/nodes/reader/base.py @@ -118,9 +118,13 @@ class BaseReader(BaseComponent): # run evaluation with labels as node inputs if add_isolated_node_eval and labels is not None: - relevant_documents = [label.document for label in labels.labels] - # Filter out empty documents - relevant_documents = [d for d in relevant_documents if d.content.strip() != ""] + # This dict comprehension deduplicates same Documents in a MultiLabel based on their Document ID and + # filters out empty documents + relevant_documents = list( + { + label.document.id: label.document for label in labels.labels if label.document.content.strip() != "" + }.values() + ) results_label_input = predict(query=query, documents=relevant_documents, top_k=top_k) # Add corresponding document_name and more meta data, if an answer contains the document_id @@ -174,10 +178,15 @@ class BaseReader(BaseComponent): if add_isolated_node_eval and labels is not None: relevant_documents = [] for labelx in labels: - # Filter out empty documents - relevant_docs_labelx = [ - label.document for label in labelx.labels if label.document.content.strip() != "" - ] + # This dict comprehension deduplicates same Documents in a MultiLabel based on their Document ID + # and filters out empty documents + relevant_docs_labelx = list( + { + label.document.id: label.document + for label in labelx.labels + if label.document.content.strip() != "" + }.values() + ) relevant_documents.append(relevant_docs_labelx) results_label_input = predict_batch(queries=queries, documents=relevant_documents, top_k=top_k) diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py index bbb83656a..b51933f72 100644 --- a/test/pipelines/test_eval.py +++ b/test/pipelines/test_eval.py @@ -1,6 +1,7 @@ import logging import pytest import sys +from copy import deepcopy from 
haystack.document_stores.memory import InMemoryDocumentStore from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore from haystack.nodes.preprocessor import PreProcessor @@ -1267,9 +1268,14 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) @pytest.mark.parametrize("reader", ["farm"], indirect=True) def test_extractive_qa_eval_isolated(reader, retriever_with_docs): + labels = deepcopy(EVAL_LABELS) + # Copy one of the labels and change only the answer to have a label with a different answer but the same Document + label_copy = deepcopy(labels[0].labels[0]) + label_copy.answer = Answer(answer="I", offsets_in_context=[Span(21, 22)]) + labels[0].labels.append(label_copy) pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) eval_result: EvaluationResult = pipeline.eval( - labels=EVAL_LABELS, + labels=labels, sas_model_name_or_path="sentence-transformers/paraphrase-MiniLM-L3-v2", add_isolated_node_eval=True, ) @@ -1292,6 +1298,12 @@ def test_extractive_qa_eval_isolated(reader, retriever_with_docs): assert metrics_top_1["Reader"]["f1"] == 1.0 assert metrics_top_1["Reader"]["sas"] == pytest.approx(1.0, abs=1e-4) + # Check if same Document in MultiLabel got deduplicated + assert labels[0].labels[0].id == labels[0].labels[1].id + reader_eval_df = eval_result.node_results["Reader"] + isolated_reader_eval_df = reader_eval_df[reader_eval_df["eval_mode"] == "isolated"] + assert len(isolated_reader_eval_df) == len(labels) * reader.top_k_per_candidate + @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) diff --git a/test/pipelines/test_eval_batch.py b/test/pipelines/test_eval_batch.py index bcb00e61c..0dac41075 100644 --- a/test/pipelines/test_eval_batch.py +++ b/test/pipelines/test_eval_batch.py @@ -1,6 +1,7 @@ import logging 
import pytest import sys +from copy import deepcopy from haystack.document_stores.memory import InMemoryDocumentStore from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore from haystack.nodes.preprocessor import PreProcessor @@ -607,9 +608,14 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True) @pytest.mark.parametrize("reader", ["farm"], indirect=True) def test_extractive_qa_eval_isolated(reader, retriever_with_docs): + labels = deepcopy(EVAL_LABELS) + # Copy one of the labels and change only the answer to have a label with a different answer but the same Document + label_copy = deepcopy(labels[0].labels[0]) + label_copy.answer = Answer(answer="I", offsets_in_context=[Span(21, 22)]) + labels[0].labels.append(label_copy) pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) eval_result: EvaluationResult = pipeline.eval_batch( - labels=EVAL_LABELS, + labels=labels, sas_model_name_or_path="sentence-transformers/paraphrase-MiniLM-L3-v2", add_isolated_node_eval=True, ) @@ -632,6 +638,12 @@ def test_extractive_qa_eval_isolated(reader, retriever_with_docs): assert metrics_top_1["Reader"]["f1"] == 1.0 assert metrics_top_1["Reader"]["sas"] == pytest.approx(1.0, abs=1e-4) + # Check if same Document in MultiLabel got deduplicated + assert labels[0].labels[0].id == labels[0].labels[1].id + reader_eval_df = eval_result.node_results["Reader"] + isolated_reader_eval_df = reader_eval_df[reader_eval_df["eval_mode"] == "isolated"] + assert len(isolated_reader_eval_df) == len(labels) * reader.top_k_per_candidate + @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)