Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-08-26 09:26:25 +00:00)

Merge branch 'master' into fix_website

This commit is contained in: commit 3793205aa3
@@ -121,6 +121,7 @@ class Finder:
label_origin: str = "gold_label",
top_k_retriever: int = 10,
top_k_reader: int = 10,
return_preds: bool = False,
):
"""
Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
@@ -165,6 +166,9 @@ class Finder:
:type top_k_retriever: int
:param top_k_reader: How many answers to return per question
:type top_k_reader: int
:param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
:type return_preds: bool
"""

if not self.reader or not self.retriever:
@@ -205,6 +209,7 @@ class Finder:
previous_return_no_answers = self.reader.return_no_answers
self.reader.return_no_answers = True

predictions = []
# extract answers
reader_start_time = time.time()
for q_idx, question_docs in enumerate(questions_with_docs):
@@ -215,8 +220,10 @@ class Finder:
question_string = question.question
docs = question_docs["docs"]  # type: ignore
single_reader_start = time.time()
predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader)  # type: ignore
predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader)  # type: ignore
read_times.append(time.time() - single_reader_start)
if return_preds:
predictions.append(predicted_answers)
counts = eval_counts_reader(question, predicted_answers, counts)

counts["number_of_has_answer"] = counts["correct_retrievals"] - counts["number_of_no_answer"]
@@ -240,7 +247,10 @@ class Finder:
eval_results["avg_reader_time"] = mean(read_times)
eval_results["total_finder_time"] = finder_total_time

return eval_results
if return_preds:
return {"metrics": eval_results, "predictions": predictions}
else:
return eval_results

def eval_batch(
self,
@@ -249,7 +259,8 @@ class Finder:
label_origin: str = "gold_label",
top_k_retriever: int = 10,
top_k_reader: int = 10,
batch_size: int = 50
batch_size: int = 50,
return_preds: bool = False,
):
"""
Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
@@ -296,10 +307,13 @@ class Finder:
:type top_k_reader: int
:param batch_size: Number of samples per batch computed at once
:type batch_size: int
:param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
:type return_preds: bool
"""

if not self.reader or not self.retriever:
raise Exception("Finder needs to have a reader and retriever for the evalutaion.")
raise Exception("Finder needs to have a reader and retriever for the evaluation.")

counts = defaultdict(float)  # type: Dict[str, float]
finder_start_time = time.time()
@@ -344,7 +358,10 @@ class Finder:
logger.info(f"{number_of_questions - correct_retrievals} questions could not be answered due to the retriever.")
logger.info(f"{correct_retrievals - counts['correct_readings_topk']} questions could not be answered due to the reader.")

return results
if return_preds:
return {"metrics": results, "predictions": predictions}
else:
return results


def _retrieve_docs(self, questions: List[MultiLabel], top_k: int, doc_index: str):
@@ -364,6 +381,9 @@ class Finder:

@staticmethod
def print_eval_results(finder_eval_results: Dict):
if "predictions" in finder_eval_results.keys():
finder_eval_results = finder_eval_results["metrics"]

print("\n___Retriever Metrics in Finder___")
print(f"Retriever Recall : {finder_eval_results['retriever_recall']:.3f}")
print(f"Retriever Mean Avg Precision: {finder_eval_results['retriever_map']:.3f}")
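The new `return_preds` flag changes what `Finder.eval()` hands back. A minimal usage sketch, assuming a `finder` with a reader and retriever already attached and eval labels already indexed; any argument names not visible in the hunks above are illustrative:

```python
# Sketch only: `finder` and its indexed eval data are assumed to exist.
eval_result = finder.eval(top_k_retriever=10, top_k_reader=10, return_preds=True)

# With return_preds=True, the result is {"metrics": {...}, "predictions": [...]}.
metrics = eval_result["metrics"]
predictions = eval_result["predictions"]

# print_eval_results now detects the "predictions" key and unwraps "metrics" itself.
finder.print_eval_results(eval_result)
```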
@@ -19,12 +19,13 @@ from haystack.file_converter.txt import TextConverter
logger = logging.getLogger(__name__)


def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
def eval_data_from_file(filename: str, max_docs: Union[int, bool]=None) -> Tuple[List[Document], List[Label]]:
"""
Read Documents + Labels from a SQuAD-style file.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.

:param filename: Path to file in SQuAD format
:param max_docs: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents.
:return: (List of Documents, List of Labels)
"""
docs = []
@@ -34,7 +35,7 @@ def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
data = json.load(file)
if "title" not in data["data"][0]:
logger.warning(f"No title information found for documents in QA file: {filename}")
for document in data["data"]:
for document in data["data"][:max_docs]:
# get all extra fields from document level (e.g. title)
meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
for paragraph in document["paragraphs"]:
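The added `max_docs` argument caps how many SQuAD documents are read. A small sketch (the file path is illustrative):

```python
from haystack.preprocessor.utils import eval_data_from_file

# Load only the first 100 documents and their labels; max_docs=None (the default) reads everything.
docs, labels = eval_data_from_file("dev-v2.0.json", max_docs=100)
```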
@@ -291,7 +291,7 @@ class FARMReader(BaseReader):
result = []
for idx, group in enumerate(grouped_predictions):
answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_question)
question = group[0]
question = group[0].question
cur_label = labels[idx]
result.append({
"question": question,
@@ -45,7 +45,8 @@ class BaseRetriever(ABC):
doc_index: str = "eval_document",
label_origin: str = "gold_label",
top_k: int = 10,
open_domain: bool = False
open_domain: bool = False,
return_preds: bool = False,
) -> dict:
"""
Performs evaluation on the Retriever.
@@ -65,6 +66,8 @@ class BaseRetriever(ABC):
contained in the retrieved docs (common approach in open-domain QA).
If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
are within ids explicitly stated in the labels.
:param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
"""

# Extract all questions for evaluation
@@ -86,11 +89,15 @@ class BaseRetriever(ABC):
deduplicated_doc_ids = list(set([str(x) for x in label.multiple_document_ids]))
question_label_dict[label.question] = deduplicated_doc_ids

predictions = []

# Option 1: Open-domain evaluation by checking if the answer string is in the retrieved docs
logger.info("Performing eval queries...")
if open_domain:
for question, gold_answers in tqdm(question_label_dict.items()):
retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index)
if return_preds:
predictions.append({"question": question, "retrieved_docs": retrieved_docs})
# check if correct doc in retrieved docs
for doc_idx, doc in enumerate(retrieved_docs):
for gold_answer in gold_answers:
@@ -102,6 +109,8 @@ class BaseRetriever(ABC):
else:
for question, gold_ids in tqdm(question_label_dict.items()):
retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index)
if return_preds:
predictions.append({"question": question, "retrieved_docs": retrieved_docs})
# check if correct doc in retrieved docs
for doc_idx, doc in enumerate(retrieved_docs):
for gold_id in gold_ids:
@@ -117,4 +126,15 @@ class BaseRetriever(ABC):
logger.info((f"For {correct_retrievals} out of {number_of_questions} questions ({recall:.2%}), the answer was in"
f" the top-{top_k} candidate passages selected by the retriever."))

return {"recall": recall, "map": mean_avg_precision, "retrieve_time": self.retrieve_time, "n_questions": number_of_questions, "top_k": top_k}
metrics = {
"recall": recall,
"map": mean_avg_precision,
"retrieve_time": self.retrieve_time,
"n_questions": number_of_questions,
"top_k": top_k
}

if return_preds:
return {"metrics": metrics, "predictions": predictions}
else:
return metrics
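A usage sketch for the extended `BaseRetriever.eval()`, assuming a ready `retriever` whose document store holds eval documents and labels under the index names shown above (arguments not visible in the hunks are assumed):

```python
# Sketch only: `retriever` is assumed to be a configured BaseRetriever subclass.
result = retriever.eval(doc_index="eval_document", top_k=10,
                        open_domain=False, return_preds=True)

print(result["metrics"]["recall"], result["metrics"]["map"])
for pred in result["predictions"]:
    question, retrieved_docs = pred["question"], pred["retrieved_docs"]
```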
test/benchmarks/README.md (new file, 20 lines)
@@ -0,0 +1,20 @@
# Benchmarks

Run the benchmarks with the following command:

```
python run.py [--reader] [--retriever_index] [--retriever_query] [--ci] [--update-json]
```

You can specify which components and processes to benchmark with the following flags.

**--reader** will trigger the speed and accuracy benchmarks for the reader. Here we simply use the SQuAD dev set.

**--retriever_index** will trigger indexing benchmarks

**--retriever_query** will trigger querying benchmarks (embeddings will be loaded from file instead of being computed on the fly)

**--ci** will cause the benchmarks to run on a smaller slice of each dataset and a smaller subset of Retriever / Reader / DocStores.

**--update-json** will cause the script to update the json files in docs/_src/benchmarks so that the website benchmarks will be updated.
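For example, a quick CI-sized run of only the querying benchmarks (flags as documented above) might look like:

```
python run.py --retriever_query --ci
```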
test/benchmarks/config.json (new file, 57 lines)
@@ -0,0 +1,57 @@
{
    "params": {
        "full": {
            "retriever_doc_stores": [
                [
                    "elastic",
                    "elasticsearch"
                ],
                [
                    "dpr",
                    "elasticsearch"
                ],
                [
                    "dpr",
                    "faiss_flat"
                ],
                [
                    "dpr",
                    "faiss_hnsw"
                ]
            ],
            "n_docs_options": [
                10000,
                100000,
                500000
            ],
            "n_queries": null
        },
        "ci": {
            "retriever_doc_stores": [
                [
                    "elastic",
                    "elasticsearch"
                ]
            ],
            "n_docs_options": [
                1000
            ],
            "n_queries": 10
        }
    },
    "filenames": {
        "data_s3_url": "s3://ext-haystack-retriever-eval/",
        "data_dir": "../../data/retriever/",
        "filename_gold": "nq2squad-dev.json",
        "filenames_negative": {
            "10000": "psgs_w100_minus_gold_10k.tsv",
            "100000": "psgs_w100_minus_gold_100k.tsv",
            "1000000": "psgs_w100_minus_gold_1m.tsv"
        },
        "embeddings_dir": "embeddings/",
        "embeddings_filenames": {
            "10000": "wikipedia_passages_10k.pkl",
            "100000": "wikipedia_passages_100k.pkl",
            "1000000": "wikipedia_passages_1m.pkl"
        }
    }
}
@@ -2,10 +2,15 @@ from utils import get_document_store, index_to_doc_store, get_reader
from haystack.preprocessor.utils import eval_data_from_file
from pathlib import Path
import pandas as pd
from results_to_json import reader as reader_json
from templates import READER_TEMPLATE
import json

reader_models = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2",

reader_models_full = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2",
"deepset/bert-base-cased-squad2", "deepset/bert-large-uncased-whole-word-masking-squad2",
"deepset/xlm-roberta-large-squad2", "distilbert-base-uncased-distilled-squad"]
reader_models_ci = ["deepset/minilm-uncased-squad2"]

reader_types = ["farm"]
data_dir = Path("../../data/squad20")
@@ -14,13 +19,24 @@ filename = "dev-v2.0.json"
# This number could vary when using a different tokenizer
n_passages = 12350

results_file = "reader_results.csv"

reader_json_file = "../../docs/_src/benchmarks/reader_performance.json"

doc_index = "eval_document"
label_index = "label"

def benchmark_reader():
def benchmark_reader(ci=False, update_json=False, **kwargs):
if ci:
reader_models = reader_models_ci
n_docs = 1
else:
reader_models = reader_models_full
n_docs = None
reader_results = []
doc_store = get_document_store("elasticsearch")
docs, labels = eval_data_from_file(data_dir/filename)
docs, labels = eval_data_from_file(data_dir/filename, n_docs)

index_to_doc_store(doc_store, docs, None, labels)
for reader_name in reader_models:
for reader_type in reader_types:
@@ -47,8 +63,17 @@ def benchmark_reader():
"error": e}
reader_results.append(results)
reader_df = pd.DataFrame.from_records(reader_results)
reader_df.to_csv("reader_results.csv")
reader_df.to_csv(results_file)
if update_json:
populate_reader_json()


def populate_reader_json():
reader_results = reader_json()
template = READER_TEMPLATE
template["data"] = reader_results
json.dump(template, open(reader_json_file, "w"), indent=4)


if __name__ == "__main__":
benchmark_reader()
benchmark_reader(True, update_json=True)
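A sketch of how the reworked `benchmark_reader` is meant to be driven; this assumes a running Elasticsearch instance, since the document store is created with `get_document_store("elasticsearch")`:

```python
# Full run over all reader models, also refreshing docs/_src/benchmarks/reader_performance.json:
benchmark_reader(ci=False, update_json=True)

# CI-sized run: only deepset/minilm-uncased-squad2 on a single document, no JSON update.
benchmark_reader(ci=True)
```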
@@ -2,8 +2,8 @@ import json
import pandas as pd
from pprint import pprint

def reader():

def reader(reader_csv="reader_results.csv"):
model_rename_map = {
'deepset/roberta-base-squad2': "RoBERTa",
'deepset/minilm-uncased-squad2': "MiniLM",
@@ -18,18 +18,18 @@ def reader():
"reader": "Model"
}

df = pd.read_csv("reader_results.csv")
df = pd.read_csv(reader_csv)
df = df[["f1", "passages_per_second", "reader"]]
df["reader"] = df["reader"].map(model_rename_map)
df = df[list(column_name_map)]
df = df.rename(columns=column_name_map)
ret = [dict(row) for i, row in df.iterrows()]
print("Reader overview")
print(json.dumps(ret, indent=2))

def retriever():
print(json.dumps(ret, indent=4))
return ret


def retriever(index_csv="retriever_index_results.csv", query_csv="retriever_query_results.csv"):
column_name_map = {
"model": "model",
"n_docs": "n_docs",
@@ -47,8 +47,8 @@ def retriever():
"faiss_hnsw": "FAISS (HSNW)"
}

index = pd.read_csv("retriever_index_results.csv")
query = pd.read_csv("retriever_query_results.csv")
index = pd.read_csv(index_csv)
query = pd.read_csv(query_csv)
df = pd.merge(index, query,
how="right",
left_on=["retriever", "doc_store", "n_docs"],
@@ -62,36 +62,38 @@ def retriever():
df = df.rename(columns=column_name_map)

print("Retriever overview")
print(retriever_overview(df))
retriever_overview_data = retriever_overview(df)
print(json.dumps(retriever_overview_data, indent=4))

print("Retriever MAP")
print(retriever_map(df))
retriever_map_data = retriever_map(df)
print(json.dumps(retriever_map_data, indent=4))

print("Retriever Speed")
print(retriever_speed(df))
retriever_speed_data = retriever_speed(df)
print(json.dumps(retriever_speed_data, indent=4))

return retriever_overview_data, retriever_map_data, retriever_speed_data


def retriever_map(df):
columns = ["model", "n_docs", "map"]
df = df[columns]
ret = df.to_dict(orient="records")
return json.dumps(ret, indent=4)
return ret


def retriever_speed(df):
columns = ["model", "n_docs", "query_speed"]
df = df[columns]
ret = df.to_dict(orient="records")
return json.dumps(ret, indent=4)

return ret


def retriever_overview(df, chosen_n_docs=100_000):

df = df[df["n_docs"] == chosen_n_docs]
ret = [dict(row) for i, row in df.iterrows()]

return json.dumps(ret, indent=2)
return ret


if __name__ == "__main__":
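The report helpers now return plain Python structures instead of pre-serialised JSON strings, so the benchmark scripts can drop them straight into the chart templates. A usage sketch, assuming the benchmark CSVs already exist:

```python
from results_to_json import reader, retriever

reader_data = reader(reader_csv="reader_results.csv")
overview, map_data, speed_data = retriever(index_csv="retriever_index_results.csv",
                                           query_csv="retriever_query_results.csv")
```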
@@ -1,7 +1,7 @@
import pandas as pd
from pathlib import Path
from time import perf_counter
from utils import get_document_store, get_retriever, index_to_doc_store
from utils import get_document_store, get_retriever, index_to_doc_store, load_config
from haystack.preprocessor.utils import eval_data_from_file
from haystack import Document
import pickle
@@ -11,142 +11,122 @@ import logging
import datetime
import random
import traceback

import os
import requests
from farm.file_utils import download_from_s3
import json
from results_to_json import retriever as retriever_json
from templates import RETRIEVER_TEMPLATE, RETRIEVER_MAP_TEMPLATE, RETRIEVER_SPEED_TEMPLATE

logger = logging.getLogger(__name__)
logging.getLogger("haystack.retriever.base").setLevel(logging.WARN)
logging.getLogger("elasticsearch").setLevel(logging.WARN)

es_similarity = "dot_product"

retriever_doc_stores = [
# ("elastic", "elasticsearch"),
# ("dpr", "elasticsearch"),
# ("dpr", "faiss_flat"),
("dpr", "faiss_hnsw")
]

n_docs_options = [
1000,
10000,
100000,
500000,
]

# If set to None, querying will be run on all queries
n_queries = None
data_dir = Path("../../data/retriever")
filename_gold = "nq2squad-dev.json"  # Found at s3://ext-haystack-retriever-eval
filename_negative = "psgs_w100_minus_gold.tsv"  # Found at s3://ext-haystack-retriever-eval
embeddings_dir = Path("embeddings")
embeddings_filenames = [f"wikipedia_passages_1m.pkl"]  # Found at s3://ext-haystack-retriever-eval

doc_index = "eval_document"
label_index = "label"

seed = 42
index_results_file = "retriever_index_results.csv"
query_results_file = "retriever_query_results.csv"

overview_json = "../../docs/_src/benchmarks/retriever_performance.json"
map_json = "../../docs/_src/benchmarks/retriever_map.json"
speed_json = "../../docs/_src/benchmarks/retriever_speed.json"


seed = 42
random.seed(42)


def prepare_data(data_dir, filename_gold, filename_negative, n_docs=None, n_queries=None, add_precomputed=False):
"""
filename_gold points to a squad format file.
filename_negative points to a csv file where the first column is doc_id and second is document text.
If add_precomputed is True, this fn will look in the embeddings files for precomputed embeddings to add to each Document
"""

gold_docs, labels = eval_data_from_file(data_dir / filename_gold)

# Reduce number of docs
gold_docs = gold_docs[:n_docs]

# Remove labels whose gold docs have been removed
doc_ids = [x.id for x in gold_docs]
labels = [x for x in labels if x.document_id in doc_ids]

# Filter labels down to n_queries
selected_queries = list(set(f"{x.document_id} | {x.question}" for x in labels))
selected_queries = selected_queries[:n_queries]
labels = [x for x in labels if f"{x.document_id} | {x.question}" in selected_queries]

n_neg_docs = max(0, n_docs - len(gold_docs))
neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs)
docs = gold_docs + neg_docs

if add_precomputed:
docs = add_precomputed_embeddings(data_dir / embeddings_dir, embeddings_filenames, docs)

return docs, labels

def prepare_negative_passages(data_dir, filename_negative, n_docs):
if n_docs == 0:
return []
with open(data_dir / filename_negative) as f:
lines = []
_ = f.readline()  # Skip column titles line
for _ in range(n_docs):
lines.append(f.readline()[:-1])

docs = []
for l in lines[:n_docs]:
id, text, title = l.split("\t")
d = {"text": text,
"meta": {"passage_id": int(id),
"title": title}}
d = Document(**d)
docs.append(d)
return docs

def benchmark_indexing():
def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_gold, filename_negative, data_s3_url, embeddings_filenames, embeddings_dir, update_json, **kwargs):

retriever_results = []
for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)
logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
try:
doc_store = get_document_store(doc_store_name)
retriever = get_retriever(retriever_name, doc_store)
docs, _ = prepare_data(data_dir=data_dir,
filename_gold=filename_gold,
filename_negative=filename_negative,
data_s3_url=data_s3_url,
embeddings_filenames=embeddings_filenames,
embeddings_dir=embeddings_dir,
n_docs=n_docs)

retriever = get_retriever(retriever_name, doc_store)
tic = perf_counter()
index_to_doc_store(doc_store, docs, retriever)
toc = perf_counter()
indexing_time = toc - tic

docs, _ = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs)
print(indexing_time)

tic = perf_counter()
index_to_doc_store(doc_store, docs, retriever)
toc = perf_counter()
indexing_time = toc - tic
retriever_results.append({
"retriever": retriever_name,
"doc_store": doc_store_name,
"n_docs": n_docs,
"indexing_time": indexing_time,
"docs_per_second": n_docs / indexing_time,
"date_time": datetime.datetime.now(),
"error": None})
retriever_df = pd.DataFrame.from_records(retriever_results)
retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
retriever_df.to_csv(index_results_file)
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(10)
del doc_store
del retriever

print(indexing_time)
except Exception as e:
tb = traceback.format_exc()
logging.ERROR(f"##### The following Error was raised while running indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
logging.Error(tb)
retriever_results.append({
"retriever": retriever_name,
"doc_store": doc_store_name,
"n_docs": n_docs,
"indexing_time": 0,
"docs_per_second": 0,
"date_time": datetime.datetime.now(),
"error": str(tb)})
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(10)
del doc_store
del retriever
if update_json:
populate_retriever_json()

retriever_results.append({
"retriever": retriever_name,
"doc_store": doc_store_name,
"n_docs": n_docs,
"indexing_time": indexing_time,
"docs_per_second": n_docs / indexing_time,
"date_time": datetime.datetime.now()})
retriever_df = pd.DataFrame.from_records(retriever_results)
retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
retriever_df.to_csv("retriever_index_results.csv")

doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(10)
del doc_store
del retriever

def benchmark_querying():
def benchmark_querying(n_docs_options,
retriever_doc_stores,
data_dir,
data_s3_url,
filename_gold,
filename_negative,
n_queries,
embeddings_filenames,
embeddings_dir,
update_json,
**kwargs):
""" Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
retriever_results = []

for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
try:
logger.info(f"##### Start run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)
logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
doc_store = get_document_store(doc_store_name)
retriever = get_retriever(retriever_name, doc_store)
add_precomputed = retriever_name in ["dpr"]
# For DPR, precomputed embeddings are loaded from file
docs, labels = prepare_data(data_dir,
filename_gold,
filename_negative,
docs, labels = prepare_data(data_dir=data_dir,
filename_gold=filename_gold,
filename_negative=filename_negative,
data_s3_url=data_s3_url,
embeddings_filenames=embeddings_filenames,
embeddings_dir=embeddings_dir,
n_docs=n_docs,
n_queries=n_queries,
add_precomputed=add_precomputed)
@@ -170,12 +150,15 @@ def benchmark_querying():
"error": None
}

doc_store.delete_all_documents()
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(5)
del doc_store
del retriever
except Exception as e:
tb = traceback.format_exc()
logging.ERROR(f"##### The following Error was raised while running querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
logging.Error(tb)
results = {
"retriever": retriever_name,
"doc_store": doc_store_name,
@@ -190,22 +173,41 @@ def benchmark_querying():
"date_time": datetime.datetime.now(),
"error": str(tb)
}
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(5)
del doc_store
del retriever
logger.info(results)
retriever_results.append(results)

retriever_df = pd.DataFrame.from_records(retriever_results)
retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
retriever_df.to_csv("retriever_query_results.csv")
retriever_df.to_csv(query_results_file)
if update_json:
populate_retriever_json()


def populate_retriever_json():
retriever_overview_data, retriever_map_data, retriever_speed_data = retriever_json(index_csv=index_results_file,
query_csv=query_results_file)
overview = RETRIEVER_TEMPLATE
overview["data"] = retriever_overview_data
map = RETRIEVER_MAP_TEMPLATE
map["data"] = retriever_map_data
speed = RETRIEVER_SPEED_TEMPLATE
speed["data"] = retriever_speed_data
json.dump(overview, open(overview_json, "w"), indent=4)
json.dump(speed, open(speed_json, "w"), indent=4)
json.dump(map, open(map_json, "w"), indent=4)


def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
ret = []
id_to_doc = {x.meta["passage_id"]: x for x in docs}
for ef in embeddings_filenames:
logger.info(f"Adding precomputed embeddings from {embeddings_dir / ef}")
filename = embeddings_dir / ef
logger.info(f"Adding precomputed embeddings from {embeddings_dir + ef}")
filename = embeddings_dir + ef
embeds = pickle.load(open(filename, "rb"))
for i, vec in embeds:
if int(i) in id_to_doc:
@@ -219,7 +221,66 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
return ret


if __name__ == "__main__":
benchmark_indexing()
benchmark_querying()
def prepare_data(data_dir, filename_gold, filename_negative, data_s3_url, embeddings_filenames, embeddings_dir, n_docs=None, n_queries=None, add_precomputed=False):
"""
filename_gold points to a squad format file.
filename_negative points to a csv file where the first column is doc_id and second is document text.
If add_precomputed is True, this fn will look in the embeddings files for precomputed embeddings to add to each Document
"""

logging.getLogger("farm").setLevel(logging.INFO)
download_from_s3(data_s3_url + filename_gold, cache_dir=data_dir)
download_from_s3(data_s3_url + filename_negative, cache_dir=data_dir)
if add_precomputed:
for embedding_filename in embeddings_filenames:
download_from_s3(data_s3_url + str(embeddings_dir) + embedding_filename, cache_dir=data_dir)
logging.getLogger("farm").setLevel(logging.WARN)

gold_docs, labels = eval_data_from_file(data_dir + filename_gold)

# Reduce number of docs
gold_docs = gold_docs[:n_docs]

# Remove labels whose gold docs have been removed
doc_ids = [x.id for x in gold_docs]
labels = [x for x in labels if x.document_id in doc_ids]

# Filter labels down to n_queries
selected_queries = list(set(f"{x.document_id} | {x.question}" for x in labels))
selected_queries = selected_queries[:n_queries]
labels = [x for x in labels if f"{x.document_id} | {x.question}" in selected_queries]

n_neg_docs = max(0, n_docs - len(gold_docs))
neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs)
docs = gold_docs + neg_docs

if add_precomputed:
docs = add_precomputed_embeddings(data_dir + embeddings_dir, embeddings_filenames, docs)

return docs, labels

def prepare_negative_passages(data_dir, filename_negative, n_docs):
if n_docs == 0:
return []
with open(data_dir + filename_negative) as f:
lines = []
_ = f.readline()  # Skip column titles line
for _ in range(n_docs):
lines.append(f.readline()[:-1])

docs = []
for l in lines[:n_docs]:
id, text, title = l.split("\t")
d = {"text": text,
"meta": {"passage_id": int(id),
"title": title}}
d = Document(**d)
docs.append(d)
return docs


if __name__ == "__main__":
params, filenames = load_config(config_filename="config.json", ci=True)
benchmark_indexing(**params, **filenames)
benchmark_querying(**params, **filenames)
@@ -1,7 +1,10 @@
from retriever import benchmark_indexing, benchmark_querying
from reader import benchmark_reader
from utils import load_config
import argparse

params, filenames = load_config(config_filename="config.json", ci=True)

parser = argparse.ArgumentParser()

parser.add_argument('--reader', default=False, action="store_true",
@@ -12,13 +15,15 @@ parser.add_argument('--retriever_query', default=False, action="store_true",
help='Perform Retriever querying benchmarks')
parser.add_argument('--ci', default=False, action="store_true",
help='Perform a smaller subset of benchmarks that are quicker to run')
parser.add_argument('--update_json', default=False, action="store_true",
help='Update the json file with the results of this run so that the website can be updated')

args = parser.parse_args()

if args.retriever_index:
benchmark_indexing(ci)
benchmark_indexing(**params, **filenames, ci=args.ci, update_json=args.update_json)
if args.retriever_query:
benchmark_querying(ci)
if args.retriever_reader:
benchmark_reader(ci)
benchmark_querying(**params, **filenames, ci=args.ci, update_json=args.update_json)
if args.reader:
benchmark_reader(**params, **filenames, ci=args.ci, update_json=args.update_json)
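Equivalent programmatic wiring, mirroring what run.py now does. A sketch only; it assumes `config.json` is present and the benchmark data can be fetched from S3:

```python
from utils import load_config
from retriever import benchmark_querying

# Load the CI-sized parameter set and matching filenames, then run only the querying benchmark.
params, filenames = load_config(config_filename="config.json", ci=True)
benchmark_querying(**params, **filenames, ci=True, update_json=False)
```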
test/benchmarks/templates.py (new file, 75 lines)
@@ -0,0 +1,75 @@
READER_TEMPLATE = {
    "chart_type": "BarChart",
    "title": "Reader Performance",
    "subtitle": "Time and Accuracy Benchmarks",
    "description": "Performance benchmarks of different Readers that can be used off-the-shelf in Haystack. Some models are geared towards speed, while others are more performance-focused. Accuracy is measured as F1 score and speed as passages/sec (with passages of 384 tokens). Each Reader is benchmarked using the SQuAD v2.0 development set, which contains 11866 question answer pairs. When tokenized using the BERT tokenizer and split using a sliding window approach, these become 12350 passages that are passed into the model. We set <i>max_seq_len=384</i> and <i>doc_stride=128</i>. These benchmarking tests are run using an AWS p3.2xlarge instance with a Nvidia V100 GPU with this <a href='https://github.com/deepset-ai/haystack/blob/master/test/benchmarks/reader.py'>script</a>. Please note that we are using the FARMReader class rather than the TransformersReader class. Also, the F1 measure that is reported here is in fact calculated on token level, rather than word level as is done in the official SQuAD script.",
    "bars": "horizontal",
    "columns": [
        "Model",
        "F1",
        "Speed (passages/sec)"
    ],
    "data": None
}

RETRIEVER_TEMPLATE = {
    "chart_type": "BarChart",
    "title": "Retriever Performance",
    "subtitle": "Time and Accuracy Benchmarks",
    "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. <b>Indexing speed</b> (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. <b>Querying speed</b> (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from <a href='https://github.com/facebookresearch/DPR/blob/master/data/download_data.py'>here</a>)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the <i>\"facebook/dpr-question_encoder-single-nq-base\"</i> and <i>\"facebook/dpr-ctx_encoder-single-nq-base\"</i> models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use <i>n_links=128</i>, <i>efSearch=20</i> and <i>efConstruction=80</i>. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.",
    "bars": "horizontal",
    "columns": [
        "Model",
        "mAP",
        "Index Speed (docs/sec)",
        "Query Speed (queries/sec)"
    ],
    "series": {
        "s0": "recall",
        "s1": "time",
        "s2": "time"
    },
    "axes": {
        "label": "recall",
        "time_side": "top",
        "time_label": "seconds"
    },
    "data": None
}

RETRIEVER_MAP_TEMPLATE = {
    "chart_type": "LineChart",
    "title": "Retriever Accuracy",
    "subtitle": "mAP at different number of docs",
    "description": "Here you can see how the mean avg. precision (mAP) of the retriever decays as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.",
    "columns": [
        "n_docs",
        "BM25 / ElasticSearch",
        "DPR / ElasticSearch",
        "DPR / FAISS (flat)",
        "DPR / FAISS (HSNW)"
    ],
    "axis": [
        { "x": "Number of docs", "y": "mAP" }
    ],
    "data": None
}

RETRIEVER_SPEED_TEMPLATE = {
    "chart_type": "LineChart",
    "title": "Retriever Speed",
    "subtitle": "Query Speed at different number of docs",
    "description": "Here you can see how the query speed of different Retriever / DocumentStore combinations scale as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.",
    "columns": [
        "n_docs",
        "BM25 / ElasticSearch",
        "DPR / ElasticSearch",
        "DPR / FAISS (flat)",
        "DPR / FAISS (HSNW)"
    ],
    "axis": [
        { "x": "Number of docs",
          "y": "Docs/sec"}
    ],
    "data": None
}
@@ -10,6 +10,7 @@ from haystack.reader.transformers import TransformersReader
import logging
import subprocess
import time
import json

from pathlib import Path
logger = logging.getLogger(__name__)
@@ -70,6 +71,7 @@ def get_document_store(document_store_type, es_similarity='cosine'):

else:
raise Exception(f"No document store fixture for '{document_store_type}'")
assert document_store.get_document_count() == 0
return document_store

def get_retriever(retriever_name, doc_store):
@@ -100,3 +102,20 @@ def index_to_doc_store(doc_store, docs, retriever, labels=None):
elif callable(getattr(retriever, "embed_passages", None)) and docs[0].embedding is None:
doc_store.update_embeddings(retriever, index=doc_index)

def load_config(config_filename, ci):
conf = json.load(open(config_filename))
if ci:
params = conf["params"]["ci"]
else:
params = conf["params"]["full"]
filenames = conf["filenames"]
max_docs = max(params["n_docs_options"])
n_docs_keys = sorted([int(x) for x in list(filenames["embeddings_filenames"])])
for k in n_docs_keys:
if max_docs <= k:
filenames["embeddings_filenames"] = [filenames["embeddings_filenames"][str(k)]]
filenames["filename_negative"] = filenames["filenames_negative"][str(k)]
break
return params, filenames
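A quick sketch of what `load_config` produces for the CI profile defined in config.json above:

```python
from utils import load_config

params, filenames = load_config(config_filename="config.json", ci=True)

# params -> {"retriever_doc_stores": [["elastic", "elasticsearch"]],
#            "n_docs_options": [1000], "n_queries": 10}
# Because max(n_docs_options) == 1000 <= 10000, load_config narrows the filenames to the 10k files:
# filenames["embeddings_filenames"] == ["wikipedia_passages_10k.pkl"]
# filenames["filename_negative"]    == "psgs_w100_minus_gold_10k.tsv"
```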