Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-11-04 03:39:31 +00:00)
Remove wrong retriever top_1 metrics from print_eval_report (#2510)

* remove wrong retriever top_1 metrics
* Update Documentation & Code Style
* don't show wrong examples frame when n_wrong_examples is 0
* Update Documentation & Code Style
* Update Documentation & Code Style
* only use farm reader during eval tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent 738e008020
commit 771ed0bb1d
@@ -75,6 +75,34 @@
         }
       ]
     },
+    {
+      "allOf": [
+        {
+          "properties": {
+            "version": {
+              "const": "1.4.0"
+            }
+          }
+        },
+        {
+          "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
+        }
+      ]
+    },
+    {
+      "allOf": [
+        {
+          "properties": {
+            "version": {
+              "const": "1.4.0"
+            }
+          }
+        },
+        {
+          "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
+        }
+      ]
+    },
     {
       "allOf": [
         {
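For context, the entries added above follow the version-dispatch pattern of the schema index: each allOf branch pairs a "version" const with a $ref to the full pipeline schema for that version, so a pipeline file is validated against the schema matching the version it declares. A minimal sketch of that mechanism with the jsonschema package, using a stand-in subschema instead of the remote $ref (hypothetical illustration, not the actual index file):

# Hypothetical illustration of the version-dispatch pattern above.
from jsonschema import validate  # third-party package: jsonschema

index_schema = {
    "oneOf": [
        {
            "allOf": [
                # This branch only applies when "version" equals its const ...
                {"properties": {"version": {"const": "1.4.0"}}, "required": ["version"]},
                # ... and then the $ref'd full schema would apply; stand-in here:
                {"properties": {"components": {"type": "array"}}},
            ]
        }
    ]
}

# Passes: the declared version selects the matching branch.
validate(instance={"version": "1.4.0", "components": []}, schema=index_schema)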
@@ -179,9 +179,13 @@ def print_eval_report(
         logger.warning("Pipelines with junctions are currently not supported.")
         return
 
+    answer_nodes = {node for node, df in eval_result.node_results.items() if len(df[df["type"] == "answer"]) > 0}
+    all_top_1_metrics = eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1)
+    answer_top_1_metrics = {node: metrics for node, metrics in all_top_1_metrics.items() if node in answer_nodes}
+
     calculated_metrics = {
         "": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col),
-        "_top_1": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1),
+        "_top_1": answer_top_1_metrics,
         " upper bound": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, eval_mode="isolated"),
     }
 
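The rationale: simulated_top_k_reader=1 only changes answer metrics, so document-returning nodes such as the retriever were previously shown with a meaningless "top_1" column. A minimal sketch of the new filter, assuming hypothetical stand-in node results rather than a real EvaluationResult:

# Stand-in dataframes mimicking eval_result.node_results (hypothetical values).
import pandas as pd

node_results = {
    "Retriever": pd.DataFrame({"type": ["document", "document"]}),
    "Reader": pd.DataFrame({"type": ["answer", "answer"]}),
}

# Same set comprehension as the patch: keep only nodes that returned answers.
answer_nodes = {node for node, df in node_results.items() if len(df[df["type"] == "answer"]) > 0}
assert answer_nodes == {"Reader"}

# Pretend these came from calculate_metrics(..., simulated_top_k_reader=1):
all_top_1_metrics = {"Retriever": {"recall": 0.9}, "Reader": {"exact_match": 0.5}}
answer_top_1_metrics = {node: m for node, m in all_top_1_metrics.items() if node in answer_nodes}
assert answer_top_1_metrics == {"Reader": {"exact_match": 0.5}}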
@@ -242,7 +246,9 @@ def _format_wrong_examples_report(eval_result: EvaluationResult, n_wrong_example
         node: eval_result.wrong_examples(node, doc_relevance_col="gold_id_or_answer_match", n=n_wrong_examples)
         for node in eval_result.node_results.keys()
     }
-    examples_formatted = {node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items()}
+    examples_formatted = {
+        node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items() if any(examples)
+    }
 
     return "\n".join(map(_format_wrong_examples_node, examples_formatted.keys(), examples_formatted.values()))
 
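The new "if any(examples)" guard drops nodes whose wrong-examples list is empty, so the report no longer prints an empty frame when n_wrong_examples is 0. A quick sketch with stand-in data:

# Hypothetical per-node wrong examples (stand-ins for eval_result.wrong_examples output).
examples = {"Retriever": ["some wrong example"], "Reader": []}

# any([]) is False, so nodes without wrong examples are omitted from the report.
examples_formatted = {node: "\n".join(ex) for node, ex in examples.items() if any(ex)}
assert examples_formatted == {"Retriever": "some wrong example"}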
@@ -305,6 +305,7 @@ EVAL_LABELS = [
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
     labels = EVAL_LABELS[:1]
 
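The decorator added here (and to each eval test below) pins the reader fixture to FARM only, matching the commit's "only use farm reader during eval tests". With indirect=True, pytest routes the parameter to the fixture via request.param instead of passing it to the test directly; a minimal sketch of the pattern (the fixture body is hypothetical, the real conftest builds an actual reader):

import pytest

@pytest.fixture
def reader(request):
    # request.param is "farm" here; a real fixture would construct a FARMReader.
    return f"{request.param}-reader"

@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_reader_fixture(reader):
    assert reader == "farm-reader"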
@@ -357,6 +358,7 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_path):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -429,6 +431,7 @@ def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_pa
 
 @pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path):
     labels = [
         # MultiLabel with filter that selects only the document about Carla
@@ -498,6 +501,7 @@ def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_sas(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -520,6 +524,7 @@ def test_extractive_qa_eval_sas(reader, retriever_with_docs):
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)
 
 
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_reader_eval_in_pipeline(reader):
     pipeline = Pipeline()
     pipeline.add_node(component=reader, name="Reader", inputs=["Query"])
@@ -537,6 +542,7 @@ def test_reader_eval_in_pipeline(reader):
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -553,6 +559,7 @@ def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -600,6 +607,7 @@ def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -651,6 +659,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 10}})
@@ -709,6 +718,7 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_isolated(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -738,6 +748,7 @@ def test_extractive_qa_eval_isolated(reader, retriever_with_docs):
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):
 
     labels = [
@@ -785,6 +796,7 @@ def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_print_eval_report(reader, retriever_with_docs):
 
     labels = [
@@ -885,6 +897,7 @@ def test_faq_calculate_metrics(retriever_with_docs):
 
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_translation(reader, retriever_with_docs):
 
     # FIXME it makes no sense to have DE->EN input and DE->EN output, right?
@@ -1017,8 +1030,7 @@ def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader):
 
 
 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_multi_retriever_pipeline_eval(document_store_with_docs, reader):
+def test_multi_retriever_pipeline_eval(document_store_with_docs):
     es_retriever = BM25Retriever(document_store=document_store_with_docs)
     dpr_retriever = DensePassageRetriever(document_store_with_docs)
     document_store_with_docs.update_embeddings(retriever=dpr_retriever)
@@ -64,6 +64,7 @@ def test_extractive_qa_answers_single_result(reader, retriever_with_docs):
 
 @pytest.mark.slow
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_answers_with_translator(reader, retriever_with_docs, en_to_de_translator, de_to_en_translator):
     base_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     pipeline = TranslationWrapperPipeline(