haystack/test/benchmarks/results_to_json.py
Branden Chan 363be65a78
Implement OpenSearch ANN (#1225)
* Simplify ODES init

* Add arguments to ES init and create script

* Rename similarity_fn_name and add util fn

* Create OpenSearchDocumentStore

* Specify params of Open Search HNSW

* Add better argument handling

* Update opensearch index mapping

* Edit opensearch default port

* Fix HNSW mapping

* Force small HNSW params

* Implement auto start and stopping of document store services

* Fix starting and stopping of ds service

* Restore HNSW params

* Add opensearch query benchmarks

* Add write wait time

* Revert wait time

* Add timeout

* Update benchmarks

* Update benchmarks

* Update benchmarks json

* Update documentation

* Update documentation

* Fix similarity name

* Improve argument passing

* Improve stopping and starting of service
2021-07-26 10:52:52 +02:00

106 lines
3.1 KiB
Python

import json
import pandas as pd
from pprint import pprint
def reader(reader_csv="reader_results.csv"):
    """Convert the reader benchmark CSV into a list of JSON-ready records.

    The CSV is reduced to the columns ``f1``, ``passages_per_second`` and
    ``reader``, the model identifiers are replaced by short display names,
    and the columns are renamed to ``F1``, ``Speed`` and ``Model``. The
    result is printed as indented JSON and returned.

    Args:
        reader_csv: Path (or any object accepted by ``pandas.read_csv``)
            of the reader benchmark results.

    Returns:
        A list with one ``{"F1": ..., "Speed": ..., "Model": ...}`` dict
        per CSV row.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    model_rename_map = {
        'deepset/roberta-base-squad2': "RoBERTa",
        'deepset/minilm-uncased-squad2': "MiniLM",
        'deepset/bert-base-cased-squad2': "BERT base",
        'deepset/bert-large-uncased-whole-word-masking-squad2': "BERT large",
        'deepset/xlm-roberta-large-squad2': "XLM-RoBERTa",
    }
    column_name_map = {
        "f1": "F1",
        "passages_per_second": "Speed",
        "reader": "Model",
    }

    # Select on a copy so the column assignment below never targets a view
    # of the freshly parsed frame.
    df = pd.read_csv(reader_csv)[list(column_name_map)].copy()

    # A plain ``Series.map(dict)`` turns unknown models into NaN, which then
    # ends up as the invalid JSON token ``NaN``. Fall back to the raw model
    # name so newly benchmarked models still show up in the output.
    df["reader"] = df["reader"].map(lambda name: model_rename_map.get(name, name))

    df = df.rename(columns=column_name_map)
    ret = df.to_dict(orient="records")

    print("Reader overview")
    print(json.dumps(ret, indent=4))
    return ret
def retriever(index_csv="retriever_index_results.csv", query_csv="retriever_query_results.csv"):
    """Convert the retriever benchmark CSVs into JSON-ready records.

    The index and query results are joined on ``retriever``, ``doc_store``
    and ``n_docs``, a human-readable ``model`` label of the form
    ``"<retriever> / <doc_store>"`` is built, and the frame is reduced to
    the columns ``model``, ``n_docs``, ``index_speed``, ``query_speed`` and
    ``map``. Three views of that frame (overview, MAP, speed) are printed
    as indented JSON and returned.

    Args:
        index_csv: Path (or any object accepted by ``pandas.read_csv``)
            of the indexing benchmark results.
        query_csv: Path (or any object accepted by ``pandas.read_csv``)
            of the querying benchmark results.

    Returns:
        A tuple ``(overview, map, speed)`` holding the results of
        ``retriever_overview``, ``retriever_map`` and ``retriever_speed``.

    Raises:
        KeyError: If a join key or one of the required output columns is
            missing after the merge.
    """
    column_name_map = {
        "model": "model",
        "n_docs": "n_docs",
        "docs_per_second": "index_speed",
        "queries_per_second": "query_speed",
        "map": "map",
    }
    name_cleaning = {
        "dpr": "DPR",
        "elastic": "BM25",
        "elasticsearch": "Elasticsearch",
        "faiss": "FAISS",
        "faiss_flat": "FAISS (flat)",
        "faiss_hnsw": "FAISS (HNSW)",
        "milvus_flat": "Milvus (flat)",
        "milvus_hnsw": "Milvus (HNSW)",
        "sentence_transformers": "Sentence Transformers",
        "opensearch_flat": "OpenSearch (flat)",
        "opensearch_hnsw": "OpenSearch (HNSW)",
    }
    merge_keys = ["retriever", "doc_store", "n_docs"]

    index = pd.read_csv(index_csv)
    query = pd.read_csv(query_csv)

    # Right join: every query-benchmark row is kept; index columns are NaN
    # where no index-benchmark row has the same keys.
    df = pd.merge(index, query, how="right", left_on=merge_keys, right_on=merge_keys)

    # A plain ``Series.map(dict)`` turns unknown identifiers into NaN, which
    # would also make the concatenated "model" label NaN. Fall back to the
    # raw identifier so new retrievers / document stores remain visible.
    for column in ("retriever", "doc_store"):
        df[column] = df[column].map(lambda raw: name_cleaning.get(raw, raw))
    df["model"] = df["retriever"] + " / " + df["doc_store"]

    df = df[list(column_name_map)]
    df = df.rename(columns=column_name_map)

    print("Retriever overview")
    retriever_overview_data = retriever_overview(df)
    print(json.dumps(retriever_overview_data, indent=4))

    print("Retriever MAP")
    retriever_map_data = retriever_map(df)
    print(json.dumps(retriever_map_data, indent=4))

    print("Retriever Speed")
    retriever_speed_data = retriever_speed(df)
    print(json.dumps(retriever_speed_data, indent=4))

    return retriever_overview_data, retriever_map_data, retriever_speed_data
def retriever_map(df):
    """Return one ``{"model", "n_docs", "map"}`` record per row of ``df``.

    Raises ``KeyError`` if any of the three columns is absent.
    """
    wanted = ["model", "n_docs", "map"]
    return df[wanted].to_dict(orient="records")
def retriever_speed(df):
    """Return one ``{"model", "n_docs", "query_speed"}`` record per row of ``df``.

    Raises ``KeyError`` if any of the three columns is absent.
    """
    wanted = ["model", "n_docs", "query_speed"]
    return df[wanted].to_dict(orient="records")
def retriever_overview(df, chosen_n_docs=100_000):
    """Return the rows of ``df`` whose ``n_docs`` equals ``chosen_n_docs``.

    Args:
        df: Frame with at least an ``n_docs`` column.
        chosen_n_docs: Corpus size to keep; rows with any other ``n_docs``
            value are dropped.

    Returns:
        A list with one ``{column: value}`` dict per matching row (empty
        if nothing matches).
    """
    selected = df[df["n_docs"] == chosen_n_docs]
    # ``to_dict(orient="records")`` keeps each column's own dtype and yields
    # native Python scalars. Building the dicts from ``iterrows()`` coerces
    # every row to a single dtype and can leave numpy integers in the result,
    # which ``json.dumps`` cannot serialise.
    return selected.to_dict(orient="records")
if __name__ == "__main__":
    # Run as a script: emit the reader report first, then the retriever
    # reports, each reading its default CSV from the working directory.
    for build_report in (reader, retriever):
        build_report()