Remove wrong retriever top_1 metrics from print_eval_report (#2510)

* remove wrong retriever top_1 metrics
* Update Documentation & Code Style
* don't show wrong examples frame when n_wrong_examples is 0
* Update Documentation & Code Style
* Update Documentation & Code Style
* only use farm reader during eval tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
parent 738e008020
commit 771ed0bb1d
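For orientation, here is a minimal usage sketch of where `print_eval_report` sits in an evaluation run. The import paths, model name, sample document, and gold label below are assumptions made for illustration and are not part of this diff.

```python
# Minimal usage sketch; import paths, model name, sample document and gold label
# are assumptions for illustration, not taken from this commit.
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, TfidfRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import Answer, Document, Label, MultiLabel
from haystack.utils import print_eval_report  # assumed import path

document_store = InMemoryDocumentStore()
document_store.write_documents([{"content": "Carla lives in Berlin."}])

pipeline = ExtractiveQAPipeline(
    reader=FARMReader(model_name_or_path="deepset/roberta-base-squad2"),
    retriever=TfidfRetriever(document_store=document_store),
)

eval_labels = [
    MultiLabel(
        labels=[
            Label(
                query="Who lives in Berlin?",
                answer=Answer(answer="Carla"),
                document=Document(content="Carla lives in Berlin."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label",
            )
        ]
    )
]

eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
# After this change, the "_top_1" columns in the report cover only answer-returning
# nodes (e.g. the Reader), not the Retriever.
print_eval_report(eval_result)
```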
					
@@ -75,6 +75,34 @@
         }
       ]
     },
+    {
+      "allOf": [
+        {
+          "properties": {
+            "version": {
+              "const": "1.4.0"
+            }
+          }
+        },
+        {
+          "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
+        }
+      ]
+    },
+    {
+      "allOf": [
+        {
+          "properties": {
+            "version": {
+              "const": "1.4.0"
+            }
+          }
+        },
+        {
+          "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
+        }
+      ]
+    },
     {
       "allOf": [
         {
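Each schema entry above gates a pipeline file on its `version` field via `allOf`: the `const` branch matches the version string and the `$ref` branch pulls in the versioned schema. A small sketch of that mechanism with the `jsonschema` package follows; the inlined sub-schema is a made-up stand-in for the $ref'd haystack-pipeline-1.4.0.schema.json, and the sample documents are assumptions.

```python
# Sketch of how one "allOf" entry above gates a document on its "version" field.
# The inlined sub-schema is a stand-in (assumption) for the $ref'd
# haystack-pipeline-1.4.0.schema.json; the sample documents are made up.
from jsonschema import ValidationError, validate

entry = {
    "allOf": [
        {"properties": {"version": {"const": "1.4.0"}}},
        {"required": ["components", "pipelines"]},  # stand-in for the versioned schema
    ]
}

# A config that declares version 1.4.0 and has the required keys passes.
validate(instance={"version": "1.4.0", "components": [], "pipelines": []}, schema=entry)

# A config with a different version fails the "const" branch of the allOf.
try:
    validate(instance={"version": "1.3.0", "components": [], "pipelines": []}, schema=entry)
except ValidationError as err:
    print("rejected:", err.message)
```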
@@ -179,9 +179,13 @@ def print_eval_report(
         logger.warning("Pipelines with junctions are currently not supported.")
         return

+    answer_nodes = {node for node, df in eval_result.node_results.items() if len(df[df["type"] == "answer"]) > 0}
+    all_top_1_metrics = eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1)
+    answer_top_1_metrics = {node: metrics for node, metrics in all_top_1_metrics.items() if node in answer_nodes}
+
     calculated_metrics = {
         "": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col),
-        "_top_1": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1),
+        "_top_1": answer_top_1_metrics,
         " upper bound": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, eval_mode="isolated"),
     }

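The hunk above keeps the simulated `top_1` metrics only for nodes whose result frame actually contains answer rows, so retriever nodes no longer get a misleading `_top_1` column. A toy illustration of that filter follows; the node names, metric values, and DataFrames are made-up stand-ins for `EvaluationResult` internals.

```python
# Toy illustration of the answer-node filter above; node names, metric values,
# and the DataFrames are made-up stand-ins for EvaluationResult internals.
import pandas as pd

node_results = {
    "Retriever": pd.DataFrame({"type": ["document", "document"]}),
    "Reader": pd.DataFrame({"type": ["answer", "answer"]}),
}
all_top_1_metrics = {
    "Retriever": {"recall_single_hit": 0.9},  # would be misleading as a "top_1" reader metric
    "Reader": {"exact_match": 0.5},
}

# Keep top_1 metrics only for nodes that actually produced answer rows.
answer_nodes = {node for node, df in node_results.items() if len(df[df["type"] == "answer"]) > 0}
answer_top_1_metrics = {node: m for node, m in all_top_1_metrics.items() if node in answer_nodes}
print(answer_top_1_metrics)  # {'Reader': {'exact_match': 0.5}}
```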
@@ -242,7 +246,9 @@ def _format_wrong_examples_report(eval_result: EvaluationResult, n_wrong_example
         node: eval_result.wrong_examples(node, doc_relevance_col="gold_id_or_answer_match", n=n_wrong_examples)
         for node in eval_result.node_results.keys()
     }
-    examples_formatted = {node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items()}
+    examples_formatted = {
+        node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items() if any(examples)
+    }

     return "\n".join(map(_format_wrong_examples_node, examples_formatted.keys(), examples_formatted.values()))

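This second hunk skips nodes that have no wrong examples, so the report no longer prints an empty frame for them when `n_wrong_examples` yields nothing. A tiny sketch of the `any(examples)` guard; the node names and example payloads are placeholders.

```python
# Tiny sketch of the any(examples) guard above; node names and payloads are placeholders.
examples = {
    "Retriever": [],  # no wrong examples -> omitted from the report
    "Reader": [{"query": "Who lives in Berlin?", "gold_answers": ["Carla"]}],
}
examples_formatted = {node: ex for node, ex in examples.items() if any(ex)}
print(examples_formatted)  # {'Reader': [{'query': 'Who lives in Berlin?', 'gold_answers': ['Carla']}]}
```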
@@ -305,6 +305,7 @@ EVAL_LABELS = [

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
     labels = EVAL_LABELS[:1]

@@ -357,6 +358,7 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_path):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -429,6 +431,7 @@ def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_pa

 @pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path):
     labels = [
         # MultiLabel with filter that selects only the document about Carla
@@ -498,6 +501,7 @@ def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_sas(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -520,6 +524,7 @@ def test_extractive_qa_eval_sas(reader, retriever_with_docs):
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)


+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_reader_eval_in_pipeline(reader):
     pipeline = Pipeline()
     pipeline.add_node(component=reader, name="Reader", inputs=["Query"])
@@ -537,6 +542,7 @@ def test_reader_eval_in_pipeline(reader):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -553,6 +559,7 @@ def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -600,6 +607,7 @@ def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -651,6 +659,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 10}})
@@ -709,6 +718,7 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_isolated(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -738,6 +748,7 @@ def test_extractive_qa_eval_isolated(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):

     labels = [
@@ -785,6 +796,7 @@ def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_print_eval_report(reader, retriever_with_docs):

     labels = [
@@ -885,6 +897,7 @@ def test_faq_calculate_metrics(retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_translation(reader, retriever_with_docs):

     # FIXME it makes no sense to have DE->EN input and DE->EN output, right?
@@ -1017,8 +1030,7 @@ def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader):


 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_multi_retriever_pipeline_eval(document_store_with_docs, reader):
+def test_multi_retriever_pipeline_eval(document_store_with_docs):
     es_retriever = BM25Retriever(document_store=document_store_with_docs)
     dpr_retriever = DensePassageRetriever(document_store_with_docs)
     document_store_with_docs.update_embeddings(retriever=dpr_retriever)

@@ -64,6 +64,7 @@ def test_extractive_qa_answers_single_result(reader, retriever_with_docs):

 @pytest.mark.slow
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_answers_with_translator(reader, retriever_with_docs, en_to_de_translator, de_to_en_translator):
     base_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     pipeline = TranslationWrapperPipeline(

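The test hunks all add the same marker so that only the FARM reader is exercised during eval tests (the single test that never uses a reader drops the fixture instead). Below is a self-contained sketch of the indirect-parametrization pattern these markers rely on; the fixture body is illustrative only, since the real `reader` fixture lives in the project's conftest.py.

```python
# Illustrative sketch of indirect parametrization; the fixture body is a
# placeholder, not the project's real "reader" fixture from conftest.py.
import pytest


@pytest.fixture
def reader(request):
    # With indirect=True, the parametrize value ("farm") arrives as request.param,
    # and the fixture can build the matching reader object from it.
    return f"{request.param}-reader"


@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_reader_fixture_receives_param(reader):
    assert reader == "farm-reader"
```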
Author: tstadel