
* Testing black on ui/
* Applying black on docstores
* Add latest docstring and tutorial changes
* Create a single GH action for Black and docs to reduce commit noise to the minimum, slightly refactor the OpenAPI action too
* Remove comments
* Relax constraints on pydoc-markdown
* Split temporary black from the docs. Pydoc-markdown was obsolete and needs a separate PR to upgrade
* Fix a couple of bugs
* Add a type: ignore that was missing somehow
* Give path to black
* Apply Black
* Apply Black
* Relocate a couple of type: ignore
* Update documentation
* Make Linux CI run after applying Black
* Triggering Black
* Apply Black
* Remove dependency, does not work well
* Remove manually double trailing commas
* Update documentation

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
106 lines
3.0 KiB
Python
# Collect the reader and retriever benchmark results from CSV files and
# print them as JSON overviews.
import json

import pandas as pd


def reader(reader_csv="reader_results.csv"):
    """Summarize reader benchmark results and print them as JSON."""
    # Map model checkpoints to the display names used in the overview
    model_rename_map = {
        "deepset/roberta-base-squad2": "RoBERTa",
        "deepset/minilm-uncased-squad2": "MiniLM",
        "deepset/bert-base-cased-squad2": "BERT base",
        "deepset/bert-large-uncased-whole-word-masking-squad2": "BERT large",
        "deepset/xlm-roberta-large-squad2": "XLM-RoBERTa",
    }

    column_name_map = {"f1": "F1", "passages_per_second": "Speed", "reader": "Model"}

    df = pd.read_csv(reader_csv)
    df = df[["f1", "passages_per_second", "reader"]]
    df["reader"] = df["reader"].map(model_rename_map)
    # Reorder and rename the columns for display
    df = df[list(column_name_map)]
    df = df.rename(columns=column_name_map)
    ret = [dict(row) for _, row in df.iterrows()]
    print("Reader overview")
    print(json.dumps(ret, indent=4))
    return ret
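
# A minimal sketch of the expected reader_results.csv, assuming only the
# columns referenced above (sample values are hypothetical, for illustration):
#
#   reader,f1,passages_per_second
#   deepset/roberta-base-squad2,82.9,92.3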


def retriever(index_csv="retriever_index_results.csv", query_csv="retriever_query_results.csv"):
    """Summarize retriever benchmark results and print them as JSON."""
    column_name_map = {
        "model": "model",
        "n_docs": "n_docs",
        "docs_per_second": "index_speed",
        "queries_per_second": "query_speed",
        "map": "map",
    }

    # Map internal retriever / document store identifiers to display names
    name_cleaning = {
        "dpr": "DPR",
        "elastic": "BM25",
        "elasticsearch": "Elasticsearch",
        "faiss": "FAISS",
        "faiss_flat": "FAISS (flat)",
        "faiss_hnsw": "FAISS (HNSW)",
        "milvus_flat": "Milvus (flat)",
        "milvus_hnsw": "Milvus (HNSW)",
        "sentence_transformers": "Sentence Transformers",
        "opensearch_flat": "OpenSearch (flat)",
        "opensearch_hnsw": "OpenSearch (HNSW)",
    }

    index = pd.read_csv(index_csv)
    query = pd.read_csv(query_csv)
    # Right join: keep every query-benchmark row, even if the matching
    # indexing run is missing
    df = pd.merge(index, query, how="right", on=["retriever", "doc_store", "n_docs"])

    df["retriever"] = df["retriever"].map(name_cleaning)
    df["doc_store"] = df["doc_store"].map(name_cleaning)
    df["model"] = df["retriever"] + " / " + df["doc_store"]

    # Reorder and rename the columns for display
    df = df[list(column_name_map)]
    df = df.rename(columns=column_name_map)

    print("Retriever overview")
    retriever_overview_data = retriever_overview(df)
    print(json.dumps(retriever_overview_data, indent=4))

    print("Retriever MAP")
    retriever_map_data = retriever_map(df)
    print(json.dumps(retriever_map_data, indent=4))

    print("Retriever Speed")
    retriever_speed_data = retriever_speed(df)
    print(json.dumps(retriever_speed_data, indent=4))

    return retriever_overview_data, retriever_map_data, retriever_speed_data
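
# A minimal sketch of the two expected inputs, assuming only the columns
# referenced above; both files must share the merge keys retriever,
# doc_store, and n_docs (any further columns are dropped):
#
#   retriever_index_results.csv:  retriever,doc_store,n_docs,docs_per_second
#   retriever_query_results.csv:  retriever,doc_store,n_docs,queries_per_second,map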


def retriever_map(df):
    """Return MAP per model and corpus size."""
    columns = ["model", "n_docs", "map"]
    df = df[columns]
    ret = df.to_dict(orient="records")
    return ret


def retriever_speed(df):
    """Return query speed per model and corpus size."""
    columns = ["model", "n_docs", "query_speed"]
    df = df[columns]
    ret = df.to_dict(orient="records")
    return ret


def retriever_overview(df, chosen_n_docs=100_000):
    """Return all metrics for the chosen corpus size."""
    df = df[df["n_docs"] == chosen_n_docs]
    ret = [dict(row) for _, row in df.iterrows()]
    return ret


if __name__ == "__main__":
    reader()
    retriever()
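
# Usage sketch: run this script from the directory containing the three
# benchmark CSVs named above; the defaults can also be overridden
# programmatically, e.g. reader("my_reader_results.csv") with a
# hypothetical path.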