Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-10-31 01:39:45 +00:00
	Remove wrong retriever top_1 metrics from print_eval_report (#2510)
				
					
				
* remove wrong retriever top_1 metrics

* Update Documentation & Code Style

* don't show wrong examples frame when n_wrong_examples is 0

* Update Documentation & Code Style

* Update Documentation & Code Style

* only use farm reader during eval tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent 738e008020
commit 771ed0bb1d
@@ -75,6 +75,34 @@
         }
       ]
     },
+    {
+      "allOf": [
+        {
+          "properties": {
+            "version": {
+              "const": "1.4.0"
+            }
+          }
+        },
+        {
+          "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
+        }
+      ]
+    },
+    {
+      "allOf": [
+        {
+          "properties": {
+            "version": {
+              "const": "1.4.0"
+            }
+          }
+        },
+        {
+          "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
+        }
+      ]
+    },
     {
       "allOf": [
         {
@@ -179,9 +179,13 @@ def print_eval_report(
         logger.warning("Pipelines with junctions are currently not supported.")
         return

+    answer_nodes = {node for node, df in eval_result.node_results.items() if len(df[df["type"] == "answer"]) > 0}
+    all_top_1_metrics = eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1)
+    answer_top_1_metrics = {node: metrics for node, metrics in all_top_1_metrics.items() if node in answer_nodes}
+
     calculated_metrics = {
         "": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col),
-        "_top_1": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1),
+        "_top_1": answer_top_1_metrics,
         " upper bound": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, eval_mode="isolated"),
     }

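For context, a hedged usage sketch of the caller-visible effect (pipeline construction and EVAL_LABELS are assumed to exist, as in the tests below; the exact call site of print_eval_report may differ from this sketch):

# Hedged usage sketch; assumes an ExtractiveQAPipeline and EVAL_LABELS
# as set up in the eval tests further down.
eval_result = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
print_eval_report(eval_result)
# After this patch, "_top_1" metrics show up only for answer nodes such as
# the Reader; the Retriever keeps its regular and upper-bound metrics.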
@@ -242,7 +246,9 @@ def _format_wrong_examples_report(eval_result: EvaluationResult, n_wrong_example
         node: eval_result.wrong_examples(node, doc_relevance_col="gold_id_or_answer_match", n=n_wrong_examples)
         for node in eval_result.node_results.keys()
     }
-    examples_formatted = {node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items()}
+    examples_formatted = {
+        node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items() if any(examples)
+    }

     return "\n".join(map(_format_wrong_examples_node, examples_formatted.keys(), examples_formatted.values()))

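The `if any(examples)` guard is what keeps an empty wrong-examples frame out of the report when `n_wrong_examples` is 0 or a node simply has no failures. A tiny self-contained illustration of that behavior (toy data, not the repo's actual structures):

# Toy illustration: nodes with an empty example list are filtered out,
# so no empty section is rendered for them.
examples = {"Retriever": [], "Reader": [{"query": "Who lives in Berlin?"}]}
formatted = {node: exs for node, exs in examples.items() if any(exs)}
assert "Retriever" not in formatted  # no wrong examples -> no section
assert "Reader" in formatted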
@@ -305,6 +305,7 @@ EVAL_LABELS = [

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
     labels = EVAL_LABELS[:1]

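The remaining test hunks all pin the `reader` fixture the same way. With `indirect=True`, pytest routes the parametrize value to the fixture as `request.param` instead of passing the raw string to the test, which is how `["farm"]` forces the FARM reader during eval tests. A self-contained sketch of the mechanism (the fixture body is illustrative, not the repo's actual conftest):

import pytest

class FakeFarmReader:
    """Illustrative stand-in for haystack's FARMReader."""

@pytest.fixture
def reader(request):
    # indirect=True delivers the parametrize value here as request.param
    # rather than injecting the raw string into the test function.
    assert request.param == "farm"
    return FakeFarmReader()

@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_reader_is_built_by_fixture(reader):
    assert isinstance(reader, FakeFarmReader)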
@@ -357,6 +358,7 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_path):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -429,6 +431,7 @@ def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_pa

 @pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path):
     labels = [
         # MultiLabel with filter that selects only the document about Carla
@@ -498,6 +501,7 @@ def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_sas(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -520,6 +524,7 @@ def test_extractive_qa_eval_sas(reader, retriever_with_docs):
     assert metrics["Reader"]["sas"] == pytest.approx(1.0)


+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_reader_eval_in_pipeline(reader):
     pipeline = Pipeline()
     pipeline.add_node(component=reader, name="Reader", inputs=["Query"])
@@ -537,6 +542,7 @@ def test_reader_eval_in_pipeline(reader):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -553,6 +559,7 @@ def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -600,6 +607,7 @@ def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
@@ -651,6 +659,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 10}})
@@ -709,6 +718,7 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_isolated(reader, retriever_with_docs):
     pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     eval_result: EvaluationResult = pipeline.eval(
@@ -738,6 +748,7 @@ def test_extractive_qa_eval_isolated(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):

     labels = [
@@ -785,6 +796,7 @@ def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_print_eval_report(reader, retriever_with_docs):

     labels = [
@@ -885,6 +897,7 @@ def test_faq_calculate_metrics(retriever_with_docs):

 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
 @pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_eval_translation(reader, retriever_with_docs):

     # FIXME it makes no sense to have DE->EN input and DE->EN output, right?
@@ -1017,8 +1030,7 @@ def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader):


 @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
-@pytest.mark.parametrize("reader", ["farm"], indirect=True)
-def test_multi_retriever_pipeline_eval(document_store_with_docs, reader):
+def test_multi_retriever_pipeline_eval(document_store_with_docs):
     es_retriever = BM25Retriever(document_store=document_store_with_docs)
     dpr_retriever = DensePassageRetriever(document_store_with_docs)
     document_store_with_docs.update_embeddings(retriever=dpr_retriever)
@@ -64,6 +64,7 @@ def test_extractive_qa_answers_single_result(reader, retriever_with_docs):

 @pytest.mark.slow
 @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
+@pytest.mark.parametrize("reader", ["farm"], indirect=True)
 def test_extractive_qa_answers_with_translator(reader, retriever_with_docs, en_to_de_translator, de_to_en_translator):
     base_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
     pipeline = TranslationWrapperPipeline(
Author: tstadel