mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 01:39:45 +00:00 
			
		
		
		
	 2531c8e061
			
		
	
	
		2531c8e061
		
			
		
	
	
	
	
		
			
			* add time and perf benchmark for es * Add retriever benchmarking * Add Reader benchmarking * add nq to squad conversion * add conversion stats * clean benchmarks * Add link to dataset * Update imports * add first support for neg psgs * Refactor test * set max_seq_len * cleanup benchmark * begin retriever speed benchmarking * Add support for retriever query index benchmarking * improve reader eval, retriever speed benchmarking * improve retriever speed benchmarking * Add retriever accuracy benchmark * Add neg doc shuffling * Add top_n * 3x speedup of SQL. add postgres docker run. make shuffle neg a param. add more logging * Add models to sweep * add option for faiss index type * remove unneeded line * change faiss to faiss_flat * begin automatic benchmark script * remove existing postgres docker for benchmarking * Add data processing scripts * Remove shuffle in script bc data already shuffled * switch hnsw setup from 256 to 128 * change es similarity to dot product by default * Error includes stack trace * Change ES default timeout * remove delete_docs() from timing for indexing * Add support for website export * update website on push to benchmarks * add complete benchmarks results * new json format * removed NaN as is not a valid json token * versioning for docs * unsaved changes * cleaning * cleaning * Edit format of benchmarks data * update also jsons in v0.4.0 Co-authored-by: brandenchan <brandenchan@icloud.com> Co-authored-by: deepset <deepset@Crenolape.localdomain> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
		
			
				
	
	
		
			99 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			99 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| import pandas as pd
 | |
| from pprint import pprint
 | |
| 
 | |
def reader():
    """Print a JSON summary of the reader benchmark to stdout.

    Reads ``reader_results.csv`` from the current working directory,
    maps raw HF model identifiers to display names, and prints a JSON
    list of ``{"F1": ..., "Speed": ..., "Model": ...}`` records.
    """
    model_rename_map = {
        'deepset/roberta-base-squad2': "RoBERTa",
        'deepset/minilm-uncased-squad2': "MiniLM",
        'deepset/bert-base-cased-squad2': "BERT base",
        'deepset/bert-large-uncased-whole-word-masking-squad2': "BERT large",
        'deepset/xlm-roberta-large-squad2': "XLM-RoBERTa",
    }

    # Maps raw CSV column names to the display names used in the output;
    # its key order also fixes the output column order.
    column_name_map = {
        "f1": "F1",
        "passages_per_second": "Speed",
        "reader": "Model"
    }

    df = pd.read_csv("reader_results.csv")
    df = df[["f1", "passages_per_second", "reader"]]
    df["reader"] = df["reader"].map(model_rename_map)
    df = df[list(column_name_map)]
    df = df.rename(columns=column_name_map)
    # to_dict(orient="records") converts numpy scalars to native Python
    # types, keeping json.dumps safe, and matches the sibling
    # retriever_* helpers (the previous dict(row)-over-iterrows idiom
    # can leak non-serializable numpy scalars).
    ret = df.to_dict(orient="records")
    print("Reader overview")
    print(json.dumps(ret, indent=2))
 | |
| 
 | |
def retriever():
    """Print JSON summaries of the retriever benchmarks to stdout.

    Reads ``retriever_index_results.csv`` and ``retriever_query_results.csv``
    from the current working directory, joins them on
    (retriever, doc_store, n_docs), prettifies the component names into a
    single ``model`` label, and prints the overview / MAP / speed tables
    produced by the retriever_* helpers.
    """
    # Maps raw CSV column names to output names; key order fixes the
    # output column order.
    column_name_map = {
        "model": "model",
        "n_docs": "n_docs",
        "docs_per_second": "index_speed",
        "queries_per_second": "query_speed",
        "map": "map"
    }

    # Display names for retriever and document-store identifiers.
    # NOTE(review): "FAISS (HSNW)" looks like a typo for "HNSW", but the
    # label is kept verbatim for output compatibility.
    name_cleaning = {
        "dpr": "DPR",
        "elastic": "BM25",
        "elasticsearch": "ElasticSearch",
        "faiss": "FAISS",
        "faiss_flat": "FAISS (flat)",
        "faiss_hnsw": "FAISS (HSNW)"
    }

    join_keys = ["retriever", "doc_store", "n_docs"]
    indexing_df = pd.read_csv("retriever_index_results.csv")
    querying_df = pd.read_csv("retriever_query_results.csv")
    # Right join: keep every query-benchmark row even without a matching
    # indexing row.
    df = indexing_df.merge(
        querying_df, how="right", left_on=join_keys, right_on=join_keys
    )

    for col in ("retriever", "doc_store"):
        df[col] = df[col].map(name_cleaning)
    df["model"] = df["retriever"] + " / " + df["doc_store"]

    df = df[list(column_name_map)].rename(columns=column_name_map)

    sections = (
        ("Retriever overview", retriever_overview),
        ("Retriever MAP", retriever_map),
        ("Retriever Speed", retriever_speed),
    )
    for heading, render in sections:
        print(heading)
        print(render(df))
 | |
| 
 | |
| 
 | |
def retriever_map(df):
    """Return the per-model MAP table as a pretty-printed JSON string."""
    wanted = ["model", "n_docs", "map"]
    records = df[wanted].to_dict(orient="records")
    return json.dumps(records, indent=4)
 | |
| 
 | |
| 
 | |
def retriever_speed(df):
    """Return the per-model query-speed table as a pretty-printed JSON string."""
    wanted = ["model", "n_docs", "query_speed"]
    records = df[wanted].to_dict(orient="records")
    return json.dumps(records, indent=4)
 | |
| 
 | |
| 
 | |
| 
 | |
def retriever_overview(df, chosen_n_docs=100_000):
    """Return the overview rows for one corpus size as a JSON string.

    :param df: merged benchmark frame; must contain an ``n_docs`` column.
    :param chosen_n_docs: corpus size whose rows make up the overview.
    :return: JSON array (indent=2) of the matching rows as records.
    """
    df = df[df["n_docs"] == chosen_n_docs]
    # to_dict(orient="records") yields native Python scalars, so integer
    # columns such as n_docs stay JSON-serializable. The previous
    # dict(row)-over-iterrows idiom produced numpy int64 values, which
    # json.dumps rejects with a TypeError.
    ret = df.to_dict(orient="records")

    return json.dumps(ret, indent=2)
 | |
| 
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|     reader()
 | |
|     retriever() |