Merge branch 'master' into fix_website

Branden Chan 2020-10-29 10:29:25 +01:00
commit 3793205aa3
13 changed files with 455 additions and 150 deletions

View File

@ -5,7 +5,7 @@
"description": "Performance benchmarks of different Readers that can be used off-the-shelf in Haystack. Some models are geared towards speed, while others are more performance-focused. Accuracy is measured as F1 score and speed as passages/sec (with passages of 384 tokens). Each Reader is benchmarked using the SQuAD v2.0 development set, which contains 11866 question answer pairs. When tokenized using the BERT tokenizer and split using a sliding window approach, these become 12350 passages that are passed into the model. We set <i>max_seq_len=384</i> and <i>doc_stride=128</i>. These benchmarking tests are run using an AWS p3.2xlarge instance with a Nvidia V100 GPU with this <a href='https://github.com/deepset-ai/haystack/blob/master/test/benchmarks/reader.py'>script</a>. Please note that we are using the FARMReader class rather than the TransformersReader class. Also, the F1 measure that is reported here is in fact calculated on token level, rather than word level as is done in the official SQuAD script.",
"bars": "horizontal",
"columns": [
"Model",
"Model",
"F1",
"Speed (passages/sec)"
],

View File

@ -121,6 +121,7 @@ class Finder:
label_origin: str = "gold_label",
top_k_retriever: int = 10,
top_k_reader: int = 10,
return_preds: bool = False,
):
"""
Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
@ -165,6 +166,9 @@ class Finder:
:type top_k_retriever: int
:param top_k_reader: How many answers to return per question
:type top_k_reader: int
:param return_preds: Whether to include predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
:type return_preds: bool
"""
if not self.reader or not self.retriever:
@ -205,6 +209,7 @@ class Finder:
previous_return_no_answers = self.reader.return_no_answers
self.reader.return_no_answers = True
predictions = []
# extract answers
reader_start_time = time.time()
for q_idx, question_docs in enumerate(questions_with_docs):
@ -215,8 +220,10 @@ class Finder:
question_string = question.question
docs = question_docs["docs"] # type: ignore
single_reader_start = time.time()
predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader) # type: ignore
predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader) # type: ignore
read_times.append(time.time() - single_reader_start)
if return_preds:
predictions.append(predicted_answers)
counts = eval_counts_reader(question, predicted_answers, counts)
counts["number_of_has_answer"] = counts["correct_retrievals"] - counts["number_of_no_answer"]
@ -240,7 +247,10 @@ class Finder:
eval_results["avg_reader_time"] = mean(read_times)
eval_results["total_finder_time"] = finder_total_time
return eval_results
if return_preds:
return {"metrics": eval_results, "predictions": predictions}
else:
return eval_results
def eval_batch(
self,
@ -249,7 +259,8 @@ class Finder:
label_origin: str = "gold_label",
top_k_retriever: int = 10,
top_k_reader: int = 10,
batch_size: int = 50
batch_size: int = 50,
return_preds: bool = False,
):
"""
Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result
@ -296,10 +307,13 @@ class Finder:
:type top_k_reader: int
:param batch_size: Number of samples per batch computed at once
:type batch_size: int
:param return_preds: Whether to include predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
:type return_preds: bool
"""
if not self.reader or not self.retriever:
raise Exception("Finder needs to have a reader and retriever for the evalutaion.")
raise Exception("Finder needs to have a reader and retriever for the evaluation.")
counts = defaultdict(float) # type: Dict[str, float]
finder_start_time = time.time()
@ -344,7 +358,10 @@ class Finder:
logger.info(f"{number_of_questions - correct_retrievals} questions could not be answered due to the retriever.")
logger.info(f"{correct_retrievals - counts['correct_readings_topk']} questions could not be answered due to the reader.")
return results
if return_preds:
return {"metrics": results, "predictions": predictions}
else:
return results
def _retrieve_docs(self, questions: List[MultiLabel], top_k: int, doc_index: str):
@ -364,6 +381,9 @@ class Finder:
@staticmethod
def print_eval_results(finder_eval_results: Dict):
if "predictions" in finder_eval_results.keys():
finder_eval_results = finder_eval_results["metrics"]
print("\n___Retriever Metrics in Finder___")
print(f"Retriever Recall : {finder_eval_results['retriever_recall']:.3f}")
print(f"Retriever Mean Avg Precision: {finder_eval_results['retriever_map']:.3f}")

View File

@ -19,12 +19,13 @@ from haystack.file_converter.txt import TextConverter
logger = logging.getLogger(__name__)
def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
def eval_data_from_file(filename: str, max_docs: Union[int, None] = None) -> Tuple[List[Document], List[Label]]:
"""
Read Documents + Labels from a SQuAD-style file.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
:param filename: Path to file in SQuAD format
:param max_docs: The maximum number of documents to load. Defaults to None, in which case all available eval documents are read.
:return: (List of Documents, List of Labels)
"""
docs = []
@ -34,7 +35,7 @@ def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]:
data = json.load(file)
if "title" not in data["data"][0]:
logger.warning(f"No title information found for documents in QA file: {filename}")
for document in data["data"]:
for document in data["data"][:max_docs]:
# get all extra fields from document level (e.g. title)
meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")}
for paragraph in document["paragraphs"]:

View File

@ -291,7 +291,7 @@ class FARMReader(BaseReader):
result = []
for idx, group in enumerate(grouped_predictions):
answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_question)
question = group[0]
question = group[0].question
cur_label = labels[idx]
result.append({
"question": question,

View File

@ -45,7 +45,8 @@ class BaseRetriever(ABC):
doc_index: str = "eval_document",
label_origin: str = "gold_label",
top_k: int = 10,
open_domain: bool = False
open_domain: bool = False,
return_preds: bool = False,
) -> dict:
"""
Performs evaluation on the Retriever.
@ -65,6 +66,8 @@ class BaseRetriever(ABC):
contained in the retrieved docs (common approach in open-domain QA).
If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
are within ids explicitly stated in the labels.
:param return_preds: Whether to include predictions in the returned dictionary. If True, the returned dictionary
contains the keys "predictions" and "metrics".
"""
# Extract all questions for evaluation
@ -86,11 +89,15 @@ class BaseRetriever(ABC):
deduplicated_doc_ids = list(set([str(x) for x in label.multiple_document_ids]))
question_label_dict[label.question] = deduplicated_doc_ids
predictions = []
# Option 1: Open-domain evaluation by checking if the answer string is in the retrieved docs
logger.info("Performing eval queries...")
if open_domain:
for question, gold_answers in tqdm(question_label_dict.items()):
retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index)
if return_preds:
predictions.append({"question": question, "retrieved_docs": retrieved_docs})
# check if correct doc in retrieved docs
for doc_idx, doc in enumerate(retrieved_docs):
for gold_answer in gold_answers:
@ -102,6 +109,8 @@ class BaseRetriever(ABC):
else:
for question, gold_ids in tqdm(question_label_dict.items()):
retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index)
if return_preds:
predictions.append({"question": question, "retrieved_docs": retrieved_docs})
# check if correct doc in retrieved docs
for doc_idx, doc in enumerate(retrieved_docs):
for gold_id in gold_ids:
@ -117,4 +126,15 @@ class BaseRetriever(ABC):
logger.info((f"For {correct_retrievals} out of {number_of_questions} questions ({recall:.2%}), the answer was in"
f" the top-{top_k} candidate passages selected by the retriever."))
return {"recall": recall, "map": mean_avg_precision, "retrieve_time": self.retrieve_time, "n_questions": number_of_questions, "top_k": top_k}
metrics = {
"recall": recall,
"map": mean_avg_precision,
"retrieve_time": self.retrieve_time,
"n_questions": number_of_questions,
"top_k": top_k
}
if return_preds:
return {"metrics": metrics, "predictions": predictions}
else:
return metrics
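A hedged sketch of calling the extended retriever eval; retriever stands for any BaseRetriever wired to a populated document store, and the label_index keyword name is assumed from the usual defaults rather than shown in this hunk.
```
result = retriever.eval(
    label_index="label",        # assumed default
    doc_index="eval_document",
    top_k=10,
    open_domain=True,
    return_preds=True,
)
print(result["metrics"]["recall"], result["metrics"]["map"])
for pred in result["predictions"][:3]:
    print(pred["question"], [doc.id for doc in pred["retrieved_docs"]])
```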

test/benchmarks/README.md Normal file
View File

@ -0,0 +1,20 @@
# Benchmarks
Run the benchmarks with the following command:
```
python run.py [--reader] [--retriever_index] [--retriever_query] [--ci] [--update_json]
```
You can specify which components and processes to benchmark with the following flags.
**--reader** will trigger the speed and accuracy benchmarks for the reader. Here we simply use the SQuAD dev set.
**--retriever_index** will trigger indexing benchmarks.
**--retriever_query** will trigger querying benchmarks (embeddings will be loaded from file instead of being computed on the fly).
**--ci** will cause the benchmarks to run on a smaller slice of each dataset and a smaller subset of Retrievers / Readers / DocStores.
**--update_json** will cause the script to update the JSON files in docs/_src/benchmarks so that the website benchmarks are updated.
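For example, a quick CI-sized reader run that also refreshes the website JSON (same flags as described above) would be:
```
python run.py --reader --ci --update_json
```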

View File

@ -0,0 +1,57 @@
{
"params": {
"full": {
"retriever_doc_stores": [
[
"elastic",
"elasticsearch"
],
[
"dpr",
"elasticsearch"
],
[
"dpr",
"faiss_flat"
],
[
"dpr",
"faiss_hnsw"
]
],
"n_docs_options": [
10000,
100000,
500000
],
"n_queries": null
},
"ci": {
"retriever_doc_stores": [
[
"elastic",
"elasticsearch"
]
],
"n_docs_options": [
1000
],
"n_queries": 10
}
},
"filenames": {
"data_s3_url": "s3://ext-haystack-retriever-eval/",
"data_dir": "../../data/retriever/",
"filename_gold": "nq2squad-dev.json",
"filenames_negative": {
"10000": "psgs_w100_minus_gold_10k.tsv",
"100000": "psgs_w100_minus_gold_100k.tsv",
"1000000": "psgs_w100_minus_gold_1m.tsv"
},
"embeddings_dir": "embeddings/",
"embeddings_filenames": {
"10000": "wikipedia_passages_10k.pkl",
"100000": "wikipedia_passages_100k.pkl",
"1000000": "wikipedia_passages_1m.pkl"}
}
}

View File

@ -2,10 +2,15 @@ from utils import get_document_store, index_to_doc_store, get_reader
from haystack.preprocessor.utils import eval_data_from_file
from pathlib import Path
import pandas as pd
from results_to_json import reader as reader_json
from templates import READER_TEMPLATE
import json
reader_models = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2",
reader_models_full = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2",
"deepset/bert-base-cased-squad2", "deepset/bert-large-uncased-whole-word-masking-squad2",
"deepset/xlm-roberta-large-squad2", "distilbert-base-uncased-distilled-squad"]
reader_models_ci = ["deepset/minilm-uncased-squad2"]
reader_types = ["farm"]
data_dir = Path("../../data/squad20")
@ -14,13 +19,24 @@ filename = "dev-v2.0.json"
# This number could vary when using a different tokenizer
n_passages = 12350
results_file = "reader_results.csv"
reader_json_file = "../../docs/_src/benchmarks/reader_performance.json"
doc_index = "eval_document"
label_index = "label"
def benchmark_reader():
def benchmark_reader(ci=False, update_json=False, **kwargs):
if ci:
reader_models = reader_models_ci
n_docs = 1
else:
reader_models = reader_models_full
n_docs = None
reader_results = []
doc_store = get_document_store("elasticsearch")
docs, labels = eval_data_from_file(data_dir/filename)
docs, labels = eval_data_from_file(data_dir/filename, n_docs)
index_to_doc_store(doc_store, docs, None, labels)
for reader_name in reader_models:
for reader_type in reader_types:
@ -47,8 +63,17 @@ def benchmark_reader():
"error": e}
reader_results.append(results)
reader_df = pd.DataFrame.from_records(reader_results)
reader_df.to_csv("reader_results.csv")
reader_df.to_csv(results_file)
if update_json:
populate_reader_json()
def populate_reader_json():
reader_results = reader_json()
template = READER_TEMPLATE
template["data"] = reader_results
json.dump(template, open(reader_json_file, "w"), indent=4)
if __name__ == "__main__":
benchmark_reader()
benchmark_reader(True, update_json=True)

View File

@ -2,8 +2,8 @@ import json
import pandas as pd
from pprint import pprint
def reader():
def reader(reader_csv="reader_results.csv"):
model_rename_map = {
'deepset/roberta-base-squad2': "RoBERTa",
'deepset/minilm-uncased-squad2': "MiniLM",
@ -18,18 +18,18 @@ def reader():
"reader": "Model"
}
df = pd.read_csv("reader_results.csv")
df = pd.read_csv(reader_csv)
df = df[["f1", "passages_per_second", "reader"]]
df["reader"] = df["reader"].map(model_rename_map)
df = df[list(column_name_map)]
df = df.rename(columns=column_name_map)
ret = [dict(row) for i, row in df.iterrows()]
print("Reader overview")
print(json.dumps(ret, indent=2))
def retriever():
print(json.dumps(ret, indent=4))
return ret
def retriever(index_csv="retriever_index_results.csv", query_csv="retriever_query_results.csv"):
column_name_map = {
"model": "model",
"n_docs": "n_docs",
@ -47,8 +47,8 @@ def retriever():
"faiss_hnsw": "FAISS (HSNW)"
}
index = pd.read_csv("retriever_index_results.csv")
query = pd.read_csv("retriever_query_results.csv")
index = pd.read_csv(index_csv)
query = pd.read_csv(query_csv)
df = pd.merge(index, query,
how="right",
left_on=["retriever", "doc_store", "n_docs"],
@ -62,36 +62,38 @@ def retriever():
df = df.rename(columns=column_name_map)
print("Retriever overview")
print(retriever_overview(df))
retriever_overview_data = retriever_overview(df)
print(json.dumps(retriever_overview_data, indent=4))
print("Retriever MAP")
print(retriever_map(df))
retriever_map_data = retriever_map(df)
print(json.dumps(retriever_map_data, indent=4))
print("Retriever Speed")
print(retriever_speed(df))
retriever_speed_data = retriever_speed(df)
print(json.dumps(retriever_speed_data, indent=4))
return retriever_overview_data, retriever_map_data, retriever_speed_data
def retriever_map(df):
columns = ["model", "n_docs", "map"]
df = df[columns]
ret = df.to_dict(orient="records")
return json.dumps(ret, indent=4)
return ret
def retriever_speed(df):
columns = ["model", "n_docs", "query_speed"]
df = df[columns]
ret = df.to_dict(orient="records")
return json.dumps(ret, indent=4)
return ret
def retriever_overview(df, chosen_n_docs=100_000):
df = df[df["n_docs"] == chosen_n_docs]
ret = [dict(row) for i, row in df.iterrows()]
return json.dumps(ret, indent=2)
return ret
if __name__ == "__main__":

View File

@ -1,7 +1,7 @@
import pandas as pd
from pathlib import Path
from time import perf_counter
from utils import get_document_store, get_retriever, index_to_doc_store
from utils import get_document_store, get_retriever, index_to_doc_store, load_config
from haystack.preprocessor.utils import eval_data_from_file
from haystack import Document
import pickle
@ -11,142 +11,122 @@ import logging
import datetime
import random
import traceback
import os
import requests
from farm.file_utils import download_from_s3
import json
from results_to_json import retriever as retriever_json
from templates import RETRIEVER_TEMPLATE, RETRIEVER_MAP_TEMPLATE, RETRIEVER_SPEED_TEMPLATE
logger = logging.getLogger(__name__)
logging.getLogger("haystack.retriever.base").setLevel(logging.WARN)
logging.getLogger("elasticsearch").setLevel(logging.WARN)
es_similarity = "dot_product"
retriever_doc_stores = [
# ("elastic", "elasticsearch"),
# ("dpr", "elasticsearch"),
# ("dpr", "faiss_flat"),
("dpr", "faiss_hnsw")
]
n_docs_options = [
1000,
10000,
100000,
500000,
]
# If set to None, querying will be run on all queries
n_queries = None
data_dir = Path("../../data/retriever")
filename_gold = "nq2squad-dev.json" # Found at s3://ext-haystack-retriever-eval
filename_negative = "psgs_w100_minus_gold.tsv" # Found at s3://ext-haystack-retriever-eval
embeddings_dir = Path("embeddings")
embeddings_filenames = [f"wikipedia_passages_1m.pkl"] # Found at s3://ext-haystack-retriever-eval
doc_index = "eval_document"
label_index = "label"
seed = 42
index_results_file = "retriever_index_results.csv"
query_results_file = "retriever_query_results.csv"
overview_json = "../../docs/_src/benchmarks/retriever_performance.json"
map_json = "../../docs/_src/benchmarks/retriever_map.json"
speed_json = "../../docs/_src/benchmarks/retriever_speed.json"
seed = 42
random.seed(42)
def prepare_data(data_dir, filename_gold, filename_negative, n_docs=None, n_queries=None, add_precomputed=False):
"""
filename_gold points to a squad format file.
filename_negative points to a csv file where the first column is doc_id and second is document text.
If add_precomputed is True, this fn will look in the embeddings files for precomputed embeddings to add to each Document
"""
gold_docs, labels = eval_data_from_file(data_dir / filename_gold)
# Reduce number of docs
gold_docs = gold_docs[:n_docs]
# Remove labels whose gold docs have been removed
doc_ids = [x.id for x in gold_docs]
labels = [x for x in labels if x.document_id in doc_ids]
# Filter labels down to n_queries
selected_queries = list(set(f"{x.document_id} | {x.question}" for x in labels))
selected_queries = selected_queries[:n_queries]
labels = [x for x in labels if f"{x.document_id} | {x.question}" in selected_queries]
n_neg_docs = max(0, n_docs - len(gold_docs))
neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs)
docs = gold_docs + neg_docs
if add_precomputed:
docs = add_precomputed_embeddings(data_dir / embeddings_dir, embeddings_filenames, docs)
return docs, labels
def prepare_negative_passages(data_dir, filename_negative, n_docs):
if n_docs == 0:
return []
with open(data_dir / filename_negative) as f:
lines = []
_ = f.readline() # Skip column titles line
for _ in range(n_docs):
lines.append(f.readline()[:-1])
docs = []
for l in lines[:n_docs]:
id, text, title = l.split("\t")
d = {"text": text,
"meta": {"passage_id": int(id),
"title": title}}
d = Document(**d)
docs.append(d)
return docs
def benchmark_indexing():
def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_gold, filename_negative, data_s3_url, embeddings_filenames, embeddings_dir, update_json, **kwargs):
retriever_results = []
for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)
logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
try:
doc_store = get_document_store(doc_store_name)
retriever = get_retriever(retriever_name, doc_store)
docs, _ = prepare_data(data_dir=data_dir,
filename_gold=filename_gold,
filename_negative=filename_negative,
data_s3_url=data_s3_url,
embeddings_filenames=embeddings_filenames,
embeddings_dir=embeddings_dir,
n_docs=n_docs)
retriever = get_retriever(retriever_name, doc_store)
tic = perf_counter()
index_to_doc_store(doc_store, docs, retriever)
toc = perf_counter()
indexing_time = toc - tic
docs, _ = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs)
print(indexing_time)
tic = perf_counter()
index_to_doc_store(doc_store, docs, retriever)
toc = perf_counter()
indexing_time = toc - tic
retriever_results.append({
"retriever": retriever_name,
"doc_store": doc_store_name,
"n_docs": n_docs,
"indexing_time": indexing_time,
"docs_per_second": n_docs / indexing_time,
"date_time": datetime.datetime.now(),
"error": None})
retriever_df = pd.DataFrame.from_records(retriever_results)
retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
retriever_df.to_csv(index_results_file)
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(10)
del doc_store
del retriever
print(indexing_time)
except Exception as e:
tb = traceback.format_exc()
logging.ERROR(f"##### The following Error was raised while running indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
logging.Error(tb)
retriever_results.append({
"retriever": retriever_name,
"doc_store": doc_store_name,
"n_docs": n_docs,
"indexing_time": 0,
"docs_per_second": 0,
"date_time": datetime.datetime.now(),
"error": str(tb)})
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(10)
del doc_store
del retriever
if update_json:
populate_retriever_json()
retriever_results.append({
"retriever": retriever_name,
"doc_store": doc_store_name,
"n_docs": n_docs,
"indexing_time": indexing_time,
"docs_per_second": n_docs / indexing_time,
"date_time": datetime.datetime.now()})
retriever_df = pd.DataFrame.from_records(retriever_results)
retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
retriever_df.to_csv("retriever_index_results.csv")
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(10)
del doc_store
del retriever
def benchmark_querying():
def benchmark_querying(n_docs_options,
retriever_doc_stores,
data_dir,
data_s3_url,
filename_gold,
filename_negative,
n_queries,
embeddings_filenames,
embeddings_dir,
update_json,
**kwargs):
""" Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
retriever_results = []
for n_docs in n_docs_options:
for retriever_name, doc_store_name in retriever_doc_stores:
try:
logger.info(f"##### Start run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
doc_store = get_document_store(doc_store_name, es_similarity=es_similarity)
logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
doc_store = get_document_store(doc_store_name)
retriever = get_retriever(retriever_name, doc_store)
add_precomputed = retriever_name in ["dpr"]
# For DPR, precomputed embeddings are loaded from file
docs, labels = prepare_data(data_dir,
filename_gold,
filename_negative,
docs, labels = prepare_data(data_dir=data_dir,
filename_gold=filename_gold,
filename_negative=filename_negative,
data_s3_url=data_s3_url,
embeddings_filenames=embeddings_filenames,
embeddings_dir=embeddings_dir,
n_docs=n_docs,
n_queries=n_queries,
add_precomputed=add_precomputed)
@ -170,12 +150,15 @@ def benchmark_querying():
"error": None
}
doc_store.delete_all_documents()
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(5)
del doc_store
del retriever
except Exception as e:
tb = traceback.format_exc()
logging.ERROR(f"##### The following Error was raised while running querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####")
logging.Error(tb)
results = {
"retriever": retriever_name,
"doc_store": doc_store_name,
@ -190,22 +173,41 @@ def benchmark_querying():
"date_time": datetime.datetime.now(),
"error": str(tb)
}
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(5)
del doc_store
del retriever
logger.info(results)
retriever_results.append(results)
retriever_df = pd.DataFrame.from_records(retriever_results)
retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store")
retriever_df.to_csv("retriever_query_results.csv")
retriever_df.to_csv(query_results_file)
if update_json:
populate_retriever_json()
def populate_retriever_json():
retriever_overview_data, retriever_map_data, retriever_speed_data = retriever_json(index_csv=index_results_file,
query_csv=query_results_file)
overview = RETRIEVER_TEMPLATE
overview["data"] = retriever_overview_data
map = RETRIEVER_MAP_TEMPLATE
map["data"] = retriever_map_data
speed = RETRIEVER_SPEED_TEMPLATE
speed["data"] = retriever_speed_data
json.dump(overview, open(overview_json, "w"), indent=4)
json.dump(speed, open(speed_json, "w"), indent=4)
json.dump(map, open(map_json, "w"), indent=4)
def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
ret = []
id_to_doc = {x.meta["passage_id"]: x for x in docs}
for ef in embeddings_filenames:
logger.info(f"Adding precomputed embeddings from {embeddings_dir / ef}")
filename = embeddings_dir / ef
logger.info(f"Adding precomputed embeddings from {embeddings_dir + ef}")
filename = embeddings_dir + ef
embeds = pickle.load(open(filename, "rb"))
for i, vec in embeds:
if int(i) in id_to_doc:
@ -219,7 +221,66 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
return ret
if __name__ == "__main__":
benchmark_indexing()
benchmark_querying()
def prepare_data(data_dir, filename_gold, filename_negative, data_s3_url, embeddings_filenames, embeddings_dir, n_docs=None, n_queries=None, add_precomputed=False):
"""
filename_gold points to a squad format file.
filename_negative points to a csv file where the first column is doc_id and second is document text.
If add_precomputed is True, this fn will look in the embeddings files for precomputed embeddings to add to each Document
"""
logging.getLogger("farm").setLevel(logging.INFO)
download_from_s3(data_s3_url + filename_gold, cache_dir=data_dir)
download_from_s3(data_s3_url + filename_negative, cache_dir=data_dir)
if add_precomputed:
for embedding_filename in embeddings_filenames:
download_from_s3(data_s3_url + str(embeddings_dir) + embedding_filename, cache_dir=data_dir)
logging.getLogger("farm").setLevel(logging.WARN)
gold_docs, labels = eval_data_from_file(data_dir + filename_gold)
# Reduce number of docs
gold_docs = gold_docs[:n_docs]
# Remove labels whose gold docs have been removed
doc_ids = [x.id for x in gold_docs]
labels = [x for x in labels if x.document_id in doc_ids]
# Filter labels down to n_queries
selected_queries = list(set(f"{x.document_id} | {x.question}" for x in labels))
selected_queries = selected_queries[:n_queries]
labels = [x for x in labels if f"{x.document_id} | {x.question}" in selected_queries]
n_neg_docs = max(0, n_docs - len(gold_docs))
neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs)
docs = gold_docs + neg_docs
if add_precomputed:
docs = add_precomputed_embeddings(data_dir + embeddings_dir, embeddings_filenames, docs)
return docs, labels
def prepare_negative_passages(data_dir, filename_negative, n_docs):
if n_docs == 0:
return []
with open(data_dir + filename_negative) as f:
lines = []
_ = f.readline() # Skip column titles line
for _ in range(n_docs):
lines.append(f.readline()[:-1])
docs = []
for l in lines[:n_docs]:
id, text, title = l.split("\t")
d = {"text": text,
"meta": {"passage_id": int(id),
"title": title}}
d = Document(**d)
docs.append(d)
return docs
if __name__ == "__main__":
params, filenames = load_config(config_filename="config.json", ci=True)
benchmark_indexing(**params, **filenames)
benchmark_querying(**params, **filenames)

View File

@ -1,7 +1,10 @@
from retriever import benchmark_indexing, benchmark_querying
from reader import benchmark_reader
from utils import load_config
import argparse
params, filenames = load_config(config_filename="config.json", ci=True)
parser = argparse.ArgumentParser()
parser.add_argument('--reader', default=False, action="store_true",
@ -12,13 +15,15 @@ parser.add_argument('--retriever_query', default=False, action="store_true",
help='Perform Retriever querying benchmarks')
parser.add_argument('--ci', default=False, action="store_true",
help='Perform a smaller subset of benchmarks that are quicker to run')
parser.add_argument('--update_json', default=False, action="store_true",
help='Update the json file with the results of this run so that the website can be updated')
args = parser.parse_args()
if args.retriever_index:
benchmark_indexing(ci)
benchmark_indexing(**params, **filenames, ci=args.ci, update_json=args.update_json)
if args.retriever_query:
benchmark_querying(ci)
if args.retriever_reader:
benchmark_reader(ci)
benchmark_querying(**params, **filenames, ci=args.ci, update_json=args.update_json)
if args.reader:
benchmark_reader(**params, **filenames, ci=args.ci, update_json=args.update_json)

View File

@ -0,0 +1,75 @@
READER_TEMPLATE = {
"chart_type": "BarChart",
"title": "Reader Performance",
"subtitle": "Time and Accuracy Benchmarks",
"description": "Performance benchmarks of different Readers that can be used off-the-shelf in Haystack. Some models are geared towards speed, while others are more performance-focused. Accuracy is measured as F1 score and speed as passages/sec (with passages of 384 tokens). Each Reader is benchmarked using the SQuAD v2.0 development set, which contains 11866 question answer pairs. When tokenized using the BERT tokenizer and split using a sliding window approach, these become 12350 passages that are passed into the model. We set <i>max_seq_len=384</i> and <i>doc_stride=128</i>. These benchmarking tests are run using an AWS p3.2xlarge instance with a Nvidia V100 GPU with this <a href='https://github.com/deepset-ai/haystack/blob/master/test/benchmarks/reader.py'>script</a>. Please note that we are using the FARMReader class rather than the TransformersReader class. Also, the F1 measure that is reported here is in fact calculated on token level, rather than word level as is done in the official SQuAD script.",
"bars": "horizontal",
"columns": [
"Model",
"F1",
"Speed (passages/sec)"
],
"data": None
}
RETRIEVER_TEMPLATE = {
"chart_type": "BarChart",
"title": "Retriever Performance",
"subtitle": "Time and Accuracy Benchmarks",
"description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. <b>Indexing speed</b> (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. <b>Querying speed</b> (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from <a href='https://github.com/facebookresearch/DPR/blob/master/data/download_data.py'>here</a>)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the <i>\"facebook/dpr-question_encoder-single-nq-base\"</i> and <i>\"facebook/dpr-ctx_encoder-single-nq-base\"</i> models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use <i>n_links=128</i>, <i>efSearch=20</i> and <i>efConstruction=80</i>. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.",
"bars": "horizontal",
"columns": [
"Model",
"mAP",
"Index Speed (docs/sec)",
"Query Speed (queries/sec)"
],
"series": {
"s0": "recall",
"s1": "time",
"s2": "time"
},
"axes": {
"label": "recall",
"time_side": "top",
"time_label": "seconds"
},
"data": None
}
RETRIEVER_MAP_TEMPLATE = {
"chart_type": "LineChart",
"title": "Retriever Accuracy",
"subtitle": "mAP at different number of docs",
"description": "Here you can see how the mean avg. precision (mAP) of the retriever decays as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.",
"columns": [
"n_docs",
"BM25 / ElasticSearch",
"DPR / ElasticSearch",
"DPR / FAISS (flat)",
"DPR / FAISS (HSNW)"
],
"axis": [
{ "x": "Number of docs", "y": "mAP" }
],
"data": None
}
RETRIEVER_SPEED_TEMPLATE = {
"chart_type": "LineChart",
"title": "Retriever Speed",
"subtitle": "Query Speed at different number of docs",
"description": "Here you can see how the query speed of different Retriever / DocumentStore combinations scale as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.",
"columns": [
"n_docs",
"BM25 / ElasticSearch",
"DPR / ElasticSearch",
"DPR / FAISS (flat)",
"DPR / FAISS (HSNW)"
],
"axis": [
{ "x": "Number of docs",
"y": "Docs/sec"}
],
"data": None
}

View File

@ -10,6 +10,7 @@ from haystack.reader.transformers import TransformersReader
import logging
import subprocess
import time
import json
from pathlib import Path
logger = logging.getLogger(__name__)
@ -70,6 +71,7 @@ def get_document_store(document_store_type, es_similarity='cosine'):
else:
raise Exception(f"No document store fixture for '{document_store_type}'")
assert document_store.get_document_count() == 0
return document_store
def get_retriever(retriever_name, doc_store):
@ -100,3 +102,20 @@ def index_to_doc_store(doc_store, docs, retriever, labels=None):
elif callable(getattr(retriever, "embed_passages", None)) and docs[0].embedding is None:
doc_store.update_embeddings(retriever, index=doc_index)
def load_config(config_filename, ci):
conf = json.load(open(config_filename))
if ci:
params = conf["params"]["ci"]
else:
params = conf["params"]["full"]
filenames = conf["filenames"]
max_docs = max(params["n_docs_options"])
n_docs_keys = sorted([int(x) for x in list(filenames["embeddings_filenames"])])
for k in n_docs_keys:
if max_docs <= k:
filenames["embeddings_filenames"] = [filenames["embeddings_filenames"][str(k)]]
filenames["filename_negative"] = filenames["filenames_negative"][str(k)]
break
return params, filenames
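To make the selection logic concrete, this sketch traces what load_config yields for the CI profile of the config.json shown earlier (the values follow directly from that file):
```
params, filenames = load_config(config_filename="config.json", ci=True)
# params -> {"retriever_doc_stores": [["elastic", "elasticsearch"]],
#            "n_docs_options": [1000],
#            "n_queries": 10}
# filenames -> the "filenames" block, with "embeddings_filenames" narrowed to
#              ["wikipedia_passages_10k.pkl"] and "filename_negative" set to
#              "psgs_w100_minus_gold_10k.tsv" (smallest bucket >= max(n_docs_options) = 1000).
```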