From 6d60cc945119e2b6deab1c79dcb5a6f67d8b1d4f Mon Sep 17 00:00:00 2001 From: brandenchan Date: Thu, 15 Oct 2020 18:12:17 +0200 Subject: [PATCH 1/9] add automation pipeline --- haystack/preprocessor/utils.py | 4 +- test/benchmarks/config.json | 69 +++++++++ test/benchmarks/reader.py | 16 ++- test/benchmarks/retriever.py | 252 ++++++++++++++++++--------------- test/benchmarks/run.py | 9 +- test/benchmarks/utils.py | 17 +++ 6 files changed, 246 insertions(+), 121 deletions(-) create mode 100644 test/benchmarks/config.json diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py index 26bb725de..caac1453b 100644 --- a/haystack/preprocessor/utils.py +++ b/haystack/preprocessor/utils.py @@ -19,7 +19,7 @@ from haystack.file_converter.txt import TextConverter logger = logging.getLogger(__name__) -def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]: +def eval_data_from_file(filename: str, n_docs: Union[int, bool]=None) -> Tuple[List[Document], List[Label]]: """ Read Documents + Labels from a SQuAD-style file. Document and Labels can then be indexed to the DocumentStore and be used for evaluation. @@ -32,7 +32,7 @@ def eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]: with open(filename, "r") as file: data = json.load(file) - for document in data["data"]: + for document in data["data"][:n_docs]: # get all extra fields from document level (e.g. title) meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")} for paragraph in document["paragraphs"]: diff --git a/test/benchmarks/config.json b/test/benchmarks/config.json new file mode 100644 index 000000000..198874b0e --- /dev/null +++ b/test/benchmarks/config.json @@ -0,0 +1,69 @@ +{ + "params": { + "full": { + "retriever_doc_stores": [ + [ + "elastic", + "elasticsearch" + ], + [ + "dpr", + "elasticsearch" + ], + [ + "dpr", + "faiss_flat" + ], + [ + "dpr", + "faiss_hnsw" + ] + ], + "n_docs_options": [ + 10000, + 100000, + 500000 + ], + "n_queries": null + }, + "ci": { + "retriever_doc_stores": [ + [ + "elastic", + "elasticsearch" + ], + [ + "dpr", + "elasticsearch" + ], + [ + "dpr", + "faiss_flat" + ], + [ + "dpr", + "faiss_hnsw" + ] + ], + "n_docs_options": [ + 1000 + ], + "n_queries": 10 + } + }, + "filenames": { + "data_s3_url": "s3://ext-haystack-retriever-eval/", + "data_dir": "../../data/retriever/", + "filename_gold": "nq2squad-dev.json", + "filenames_negative": { + "10000": "psgs_w100_minus_gold_10k.tsv", + "100000": "psgs_w100_minus_gold_100k.tsv", + "1000000": "psgs_w100_minus_gold_1m.tsv" + }, + "embeddings_dir": "embeddings/", + "embeddings_filenames": { + "10000": "wikipedia_passages_10k.pkl", + "100000": "wikipedia_passages_100k.pkl", + "1000000": "wikipedia_passages_1m.pkl"} + } +} \ No newline at end of file diff --git a/test/benchmarks/reader.py b/test/benchmarks/reader.py index 534556879..240164d05 100644 --- a/test/benchmarks/reader.py +++ b/test/benchmarks/reader.py @@ -3,9 +3,10 @@ from haystack.preprocessor.utils import eval_data_from_file from pathlib import Path import pandas as pd -reader_models = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2", +reader_models_full = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2", "deepset/bert-base-cased-squad2", "deepset/bert-large-uncased-whole-word-masking-squad2", "deepset/xlm-roberta-large-squad2", "distilbert-base-uncased-distilled-squad"] +reader_models_ci = ["deepset/minilm-uncased-squad2"] reader_types = ["farm"] data_dir = Path("../../data/squad20") @@ -17,10 
+18,17 @@ n_passages = 12350 doc_index = "eval_document" label_index = "label" -def benchmark_reader(): +def benchmark_reader(ci=False): + if ci: + reader_models = reader_models_ci + n_docs = 1 + else: + reader_models = reader_models_full + n_docs = None reader_results = [] doc_store = get_document_store("elasticsearch") - docs, labels = eval_data_from_file(data_dir/filename) + docs, labels = eval_data_from_file(data_dir/filename, n_docs) + index_to_doc_store(doc_store, docs, None, labels) for reader_name in reader_models: for reader_type in reader_types: @@ -51,4 +59,4 @@ def benchmark_reader(): if __name__ == "__main__": - benchmark_reader() \ No newline at end of file + benchmark_reader(True) \ No newline at end of file diff --git a/test/benchmarks/retriever.py b/test/benchmarks/retriever.py index d3c637e4e..8c4acf820 100644 --- a/test/benchmarks/retriever.py +++ b/test/benchmarks/retriever.py @@ -1,7 +1,7 @@ import pandas as pd from pathlib import Path from time import perf_counter -from utils import get_document_store, get_retriever, index_to_doc_store +from utils import get_document_store, get_retriever, index_to_doc_store, load_config from haystack.preprocessor.utils import eval_data_from_file from haystack import Document import pickle @@ -11,142 +11,107 @@ import logging import datetime import random import traceback +import os +import requests +from farm.file_utils import download_from_s3 +import json + logger = logging.getLogger(__name__) logging.getLogger("haystack.retriever.base").setLevel(logging.WARN) logging.getLogger("elasticsearch").setLevel(logging.WARN) -es_similarity = "dot_product" - -retriever_doc_stores = [ - # ("elastic", "elasticsearch"), - # ("dpr", "elasticsearch"), - # ("dpr", "faiss_flat"), - ("dpr", "faiss_hnsw") -] - -n_docs_options = [ - 1000, - 10000, - 100000, - 500000, -] - -# If set to None, querying will be run on all queries -n_queries = None -data_dir = Path("../../data/retriever") -filename_gold = "nq2squad-dev.json" # Found at s3://ext-haystack-retriever-eval -filename_negative = "psgs_w100_minus_gold.tsv" # Found at s3://ext-haystack-retriever-eval -embeddings_dir = Path("embeddings") -embeddings_filenames = [f"wikipedia_passages_1m.pkl"] # Found at s3://ext-haystack-retriever-eval - doc_index = "eval_document" label_index = "label" seed = 42 - random.seed(42) - -def prepare_data(data_dir, filename_gold, filename_negative, n_docs=None, n_queries=None, add_precomputed=False): - """ - filename_gold points to a squad format file. - filename_negative points to a csv file where the first column is doc_id and second is document text. 
- If add_precomputed is True, this fn will look in the embeddings files for precomputed embeddings to add to each Document - """ - - gold_docs, labels = eval_data_from_file(data_dir / filename_gold) - - # Reduce number of docs - gold_docs = gold_docs[:n_docs] - - # Remove labels whose gold docs have been removed - doc_ids = [x.id for x in gold_docs] - labels = [x for x in labels if x.document_id in doc_ids] - - # Filter labels down to n_queries - selected_queries = list(set(f"{x.document_id} | {x.question}" for x in labels)) - selected_queries = selected_queries[:n_queries] - labels = [x for x in labels if f"{x.document_id} | {x.question}" in selected_queries] - - n_neg_docs = max(0, n_docs - len(gold_docs)) - neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs) - docs = gold_docs + neg_docs - - if add_precomputed: - docs = add_precomputed_embeddings(data_dir / embeddings_dir, embeddings_filenames, docs) - - return docs, labels - -def prepare_negative_passages(data_dir, filename_negative, n_docs): - if n_docs == 0: - return [] - with open(data_dir / filename_negative) as f: - lines = [] - _ = f.readline() # Skip column titles line - for _ in range(n_docs): - lines.append(f.readline()[:-1]) - - docs = [] - for l in lines[:n_docs]: - id, text, title = l.split("\t") - d = {"text": text, - "meta": {"passage_id": int(id), - "title": title}} - d = Document(**d) - docs.append(d) - return docs - -def benchmark_indexing(): +def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_gold, filename_negative, data_s3_url, embeddings_filenames, embeddings_dir, **kwargs): retriever_results = [] for n_docs in n_docs_options: for retriever_name, doc_store_name in retriever_doc_stores: - doc_store = get_document_store(doc_store_name, es_similarity=es_similarity) + logger.info(f"##### Start indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ") + try: + doc_store = get_document_store(doc_store_name) + retriever = get_retriever(retriever_name, doc_store) + docs, _ = prepare_data(data_dir=data_dir, + filename_gold=filename_gold, + filename_negative=filename_negative, + data_s3_url=data_s3_url, + embeddings_filenames=embeddings_filenames, + embeddings_dir=embeddings_dir, + n_docs=n_docs) - retriever = get_retriever(retriever_name, doc_store) + tic = perf_counter() + index_to_doc_store(doc_store, docs, retriever) + toc = perf_counter() + indexing_time = toc - tic - docs, _ = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs) + print(indexing_time) - tic = perf_counter() - index_to_doc_store(doc_store, docs, retriever) - toc = perf_counter() - indexing_time = toc - tic + retriever_results.append({ + "retriever": retriever_name, + "doc_store": doc_store_name, + "n_docs": n_docs, + "indexing_time": indexing_time, + "docs_per_second": n_docs / indexing_time, + "date_time": datetime.datetime.now(), + "error": None}) + retriever_df = pd.DataFrame.from_records(retriever_results) + retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store") + retriever_df.to_csv("retriever_index_results.csv") + doc_store.delete_all_documents(index=doc_index) + doc_store.delete_all_documents(index=label_index) + time.sleep(10) + del doc_store + del retriever - print(indexing_time) + except Exception as e: + tb = traceback.format_exc() + retriever_results.append({ + "retriever": retriever_name, + "doc_store": doc_store_name, + "n_docs": n_docs, + "indexing_time": indexing_time, + "docs_per_second": n_docs / indexing_time, + 
"date_time": datetime.datetime.now(), + "error": str(tb)}) + doc_store.delete_all_documents(index=doc_index) + doc_store.delete_all_documents(index=label_index) + time.sleep(10) + del doc_store + del retriever - retriever_results.append({ - "retriever": retriever_name, - "doc_store": doc_store_name, - "n_docs": n_docs, - "indexing_time": indexing_time, - "docs_per_second": n_docs / indexing_time, - "date_time": datetime.datetime.now()}) - retriever_df = pd.DataFrame.from_records(retriever_results) - retriever_df = retriever_df.sort_values(by="retriever").sort_values(by="doc_store") - retriever_df.to_csv("retriever_index_results.csv") - - doc_store.delete_all_documents(index=doc_index) - doc_store.delete_all_documents(index=label_index) - time.sleep(10) - del doc_store - del retriever - -def benchmark_querying(): +def benchmark_querying(n_docs_options, + retriever_doc_stores, + data_dir, + data_s3_url, + filename_gold, + filename_negative, + n_queries, + embeddings_filenames, + embeddings_dir, + **kwargs): """ Benchmark the time it takes to perform querying. Doc embeddings are loaded from file.""" retriever_results = [] + for n_docs in n_docs_options: for retriever_name, doc_store_name in retriever_doc_stores: try: - logger.info(f"##### Start run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ") - doc_store = get_document_store(doc_store_name, es_similarity=es_similarity) + logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ") + doc_store = get_document_store(doc_store_name) retriever = get_retriever(retriever_name, doc_store) add_precomputed = retriever_name in ["dpr"] # For DPR, precomputed embeddings are loaded from file - docs, labels = prepare_data(data_dir, - filename_gold, - filename_negative, + docs, labels = prepare_data(data_dir=data_dir, + filename_gold=filename_gold, + filename_negative=filename_negative, + data_s3_url=data_s3_url, + embeddings_filenames=embeddings_filenames, + embeddings_dir=embeddings_dir, n_docs=n_docs, n_queries=n_queries, add_precomputed=add_precomputed) @@ -190,6 +155,10 @@ def benchmark_querying(): "date_time": datetime.datetime.now(), "error": str(tb) } + doc_store.delete_all_documents() + time.sleep(5) + del doc_store + del retriever logger.info(results) retriever_results.append(results) @@ -204,8 +173,8 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs): ret = [] id_to_doc = {x.meta["passage_id"]: x for x in docs} for ef in embeddings_filenames: - logger.info(f"Adding precomputed embeddings from {embeddings_dir / ef}") - filename = embeddings_dir / ef + logger.info(f"Adding precomputed embeddings from {embeddings_dir + ef}") + filename = embeddings_dir + ef embeds = pickle.load(open(filename, "rb")) for i, vec in embeds: if int(i) in id_to_doc: @@ -219,6 +188,65 @@ def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs): return ret +def prepare_data(data_dir, filename_gold, filename_negative, data_s3_url, embeddings_filenames, embeddings_dir, n_docs=None, n_queries=None, add_precomputed=False): + """ + filename_gold points to a squad format file. + filename_negative points to a csv file where the first column is doc_id and second is document text. 
+ If add_precomputed is True, this fn will look in the embeddings files for precomputed embeddings to add to each Document + """ + + logging.getLogger("farm").setLevel(logging.INFO) + download_from_s3(data_s3_url + filename_gold, cache_dir=data_dir) + download_from_s3(data_s3_url + filename_negative, cache_dir=data_dir) + if add_precomputed: + for embedding_filename in embeddings_filenames: + download_from_s3(data_s3_url + str(embeddings_dir) + embedding_filename, cache_dir=data_dir) + logging.getLogger("farm").setLevel(logging.WARN) + + gold_docs, labels = eval_data_from_file(data_dir + filename_gold) + + # Reduce number of docs + gold_docs = gold_docs[:n_docs] + + # Remove labels whose gold docs have been removed + doc_ids = [x.id for x in gold_docs] + labels = [x for x in labels if x.document_id in doc_ids] + + # Filter labels down to n_queries + selected_queries = list(set(f"{x.document_id} | {x.question}" for x in labels)) + selected_queries = selected_queries[:n_queries] + labels = [x for x in labels if f"{x.document_id} | {x.question}" in selected_queries] + + n_neg_docs = max(0, n_docs - len(gold_docs)) + neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs) + docs = gold_docs + neg_docs + + if add_precomputed: + docs = add_precomputed_embeddings(data_dir + embeddings_dir, embeddings_filenames, docs) + + return docs, labels + +def prepare_negative_passages(data_dir, filename_negative, n_docs): + if n_docs == 0: + return [] + with open(data_dir + filename_negative) as f: + lines = [] + _ = f.readline() # Skip column titles line + for _ in range(n_docs): + lines.append(f.readline()[:-1]) + + docs = [] + for l in lines[:n_docs]: + id, text, title = l.split("\t") + d = {"text": text, + "meta": {"passage_id": int(id), + "title": title}} + d = Document(**d) + docs.append(d) + return docs + + if __name__ == "__main__": - # benchmark_indexing() - benchmark_querying() + params, filenames = load_config(config_filename="config.json", ci=True) + benchmark_indexing(**params, **filenames) + benchmark_querying(**params, **filenames) diff --git a/test/benchmarks/run.py b/test/benchmarks/run.py index d318fd529..81aa6455a 100644 --- a/test/benchmarks/run.py +++ b/test/benchmarks/run.py @@ -1,7 +1,10 @@ from retriever import benchmark_indexing, benchmark_querying from reader import benchmark_reader +from utils import load_config import argparse +params, filenames = load_config(config_filename="config.json", ci=True) + parser = argparse.ArgumentParser() parser.add_argument('--reader', default=False, action="store_true", @@ -16,9 +19,9 @@ parser.add_argument('--ci', default=False, action="store_true", args = parser.parse_args() if args.retriever_index: - benchmark_indexing(ci) + benchmark_indexing(**params, **filenames, ci=args.ci) if args.retriever_query: - benchmark_querying(ci) + benchmark_querying(**params, **filenames, ci=args.ci) if args.retriever_reader: - benchmark_reader(ci) + benchmark_reader(**params, **filenames, ci=args.ci) diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index c8edf0dd7..8ea7c2ec7 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -104,3 +104,20 @@ def index_to_doc_store(doc_store, docs, retriever, labels=None): elif callable(getattr(retriever, "embed_passages", None)) and docs[0].embedding is None: doc_store.update_embeddings(retriever, index=doc_index) +def load_config(config_filename, ci): + conf = json.load(open(config_filename)) + if ci: + params = conf["params"]["ci"] + else: + params = 
conf["params"]["full"] + filenames = conf["filenames"] + max_docs = max(params["n_docs_options"]) + n_docs_keys = sorted([int(x) for x in list(filenames["embeddings_filenames"])]) + for k in n_docs_keys: + if max_docs <= k: + filenames["embeddings_filenames"] = [filenames["embeddings_filenames"][str(k)]] + filenames["filename_negative"] = filenames["filenames_negative"][str(k)] + break + return params, filenames + + From b9bb8d6cc178b37ba6ae40ddc63787f9320f382f Mon Sep 17 00:00:00 2001 From: brandenchan Date: Fri, 16 Oct 2020 12:16:32 +0200 Subject: [PATCH 2/9] Fix try except --- test/benchmarks/retriever.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/benchmarks/retriever.py b/test/benchmarks/retriever.py index 8c4acf820..2ec4055d6 100644 --- a/test/benchmarks/retriever.py +++ b/test/benchmarks/retriever.py @@ -75,8 +75,8 @@ def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_ "retriever": retriever_name, "doc_store": doc_store_name, "n_docs": n_docs, - "indexing_time": indexing_time, - "docs_per_second": n_docs / indexing_time, + "indexing_time": 0, + "docs_per_second": 0, "date_time": datetime.datetime.now(), "error": str(tb)}) doc_store.delete_all_documents(index=doc_index) From 11a397694517477841bfb9cac34460a83e39e46a Mon Sep 17 00:00:00 2001 From: Malte Pietsch Date: Mon, 19 Oct 2020 14:40:26 +0200 Subject: [PATCH 3/9] update deletes. fix arg in run.py --- test/benchmarks/retriever.py | 6 ++++-- test/benchmarks/run.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test/benchmarks/retriever.py b/test/benchmarks/retriever.py index 2ec4055d6..ec91c46d4 100644 --- a/test/benchmarks/retriever.py +++ b/test/benchmarks/retriever.py @@ -135,7 +135,8 @@ def benchmark_querying(n_docs_options, "error": None } - doc_store.delete_all_documents() + doc_store.delete_all_documents(index=doc_index) + doc_store.delete_all_documents(index=label_index) time.sleep(5) del doc_store del retriever @@ -155,7 +156,8 @@ def benchmark_querying(n_docs_options, "date_time": datetime.datetime.now(), "error": str(tb) } - doc_store.delete_all_documents() + doc_store.delete_all_documents(index=doc_index) + doc_store.delete_all_documents(index=label_index) time.sleep(5) del doc_store del retriever diff --git a/test/benchmarks/run.py b/test/benchmarks/run.py index 81aa6455a..ad743f27e 100644 --- a/test/benchmarks/run.py +++ b/test/benchmarks/run.py @@ -22,6 +22,6 @@ if args.retriever_index: benchmark_indexing(**params, **filenames, ci=args.ci) if args.retriever_query: benchmark_querying(**params, **filenames, ci=args.ci) -if args.retriever_reader: +if args.reader: benchmark_reader(**params, **filenames, ci=args.ci) From 87e5f06fa88b0b7bd865f1bac90577fc59b4ad59 Mon Sep 17 00:00:00 2001 From: brandenchan Date: Wed, 21 Oct 2020 17:59:44 +0200 Subject: [PATCH 4/9] add automatic json update --- test/benchmarks/results_to_json.py | 7 ++----- test/benchmarks/templates.py | 2 +- test/benchmarks/utils.py | 1 + 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/test/benchmarks/results_to_json.py b/test/benchmarks/results_to_json.py index 4d1f74060..cbd7fe549 100644 --- a/test/benchmarks/results_to_json.py +++ b/test/benchmarks/results_to_json.py @@ -80,22 +80,19 @@ def retriever_map(df): columns = ["model", "n_docs", "map"] df = df[columns] ret = df.to_dict(orient="records") - return json.dumps(ret, indent=4) + return ret def retriever_speed(df): columns = ["model", "n_docs", "query_speed"] df = df[columns] ret = 
df.to_dict(orient="records") - return json.dumps(ret, indent=4) - + return ret def retriever_overview(df, chosen_n_docs=100_000): - df = df[df["n_docs"] == chosen_n_docs] ret = [dict(row) for i, row in df.iterrows()] - return ret diff --git a/test/benchmarks/templates.py b/test/benchmarks/templates.py index 4a9857f60..eab20aa10 100644 --- a/test/benchmarks/templates.py +++ b/test/benchmarks/templates.py @@ -20,7 +20,7 @@ RETRIEVER_TEMPLATE = { "bars": "horizontal", "columns": [ "Model", - "Recall", + "mAP", "Index Speed (docs/sec)", "Query Speed (queries/sec)" ], diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index bd28495c6..e370ff251 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -10,6 +10,7 @@ from haystack.reader.transformers import TransformersReader import logging import subprocess import time +import json from pathlib import Path logger = logging.getLogger(__name__) From b0483cfd9928236271aef3455e4dcbe98f0e106c Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 22 Oct 2020 15:32:56 +0200 Subject: [PATCH 5/9] add readme --- test/benchmarks/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 test/benchmarks/README.md diff --git a/test/benchmarks/README.md b/test/benchmarks/README.md new file mode 100644 index 000000000..ba06be2ad --- /dev/null +++ b/test/benchmarks/README.md @@ -0,0 +1,20 @@ +# Benchmarks + +Run the benchmarks with the following command: + +``` +python run.py [--reader] [--retriever_index] [--retriever_query] [--ci] [--update-json] +``` + +You can specify which components and processes to benchmark with the following flags. + +**--reader** will trigger the speed and accuracy benchmarks for the reader. Here we simply use the SQuAD dev set. + +**--retriever_index** will trigger indexing benchmarks. + +**--retriever_query** will trigger querying benchmarks (embeddings will be loaded from file instead of being computed on the fly). + +**--ci** will cause the benchmarks to run on a smaller slice of each dataset and a smaller subset of Retriever / Reader / DocStores. + +**--update-json** will cause the script to update the json files in docs/_src/benchmarks so that the website benchmarks will be updated. 
+ \ No newline at end of file From fbacdfd263de117748b09e6bed18dc80fa388a6e Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 22 Oct 2020 15:45:46 +0200 Subject: [PATCH 6/9] Add logging of error, add n_docs assert --- test/benchmarks/retriever.py | 4 ++++ test/benchmarks/utils.py | 1 + 2 files changed, 5 insertions(+) diff --git a/test/benchmarks/retriever.py b/test/benchmarks/retriever.py index 3a69ed20d..807acdc2e 100644 --- a/test/benchmarks/retriever.py +++ b/test/benchmarks/retriever.py @@ -79,6 +79,8 @@ def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_ except Exception as e: tb = traceback.format_exc() + logger.error(f"##### The following Error was raised while running indexing run: {retriever_name}, {doc_store_name}, {n_docs} docs #####") + logger.error(tb) retriever_results.append({ "retriever": retriever_name, "doc_store": doc_store_name, @@ -155,6 +157,8 @@ def benchmark_querying( del retriever except Exception as e: tb = traceback.format_exc() + logger.error(f"##### The following Error was raised while running querying run: {retriever_name}, {doc_store_name}, {n_docs} docs #####") + logger.error(tb) results = { "retriever": retriever_name, "doc_store": doc_store_name, diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index e370ff251..ff8e3abaf 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -71,6 +71,7 @@ def get_document_store(document_store_type, es_similarity='cosine'): else: raise Exception(f"No document store fixture for '{document_store_type}'") + assert document_store.get_document_count() == 0 return document_store def get_retriever(retriever_name, doc_store): From 7c81dfdc3af276d9472bca761acf697cf1daed66 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 27 Oct 2020 12:41:11 +0100 Subject: [PATCH 7/9] Address reviewer comments --- docs/_src/benchmarks/reader_performance.json | 10 +++++----- haystack/preprocessor/utils.py | 3 ++- test/benchmarks/reader_results.csv | 6 +++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/_src/benchmarks/reader_performance.json b/docs/_src/benchmarks/reader_performance.json index fc8d6fea5..41595406f 100644 --- a/docs/_src/benchmarks/reader_performance.json +++ b/docs/_src/benchmarks/reader_performance.json @@ -10,10 +10,10 @@ "Speed (passages/sec)" ], "data": [ - { - "F1": 0.7384645069741224, - "Speed": 4239.284314014953, - "Model": "MiniLM" - } + {"F1": 80.67985794671885, "Model": "RoBERTa", "Speed": 92.3039712094936}, + {"F1": 78.23306265318686, "Model": "MiniLM", "Speed": 98.62387044489223}, + {"F1": 74.90271600053505, "Model": "BERT base", "Speed": 99.92750782409666}, + {"F1": 82.64545708097472, "Model": "BERT large", "Speed": 39.529824033964466}, + {"F1": 85.26275190954586, "Model": "XLM-RoBERTa", "Speed": 39.29142006004379} ] } \ No newline at end of file diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py index 9f4c6851c..06a2ad407 100644 --- a/haystack/preprocessor/utils.py +++ b/haystack/preprocessor/utils.py @@ -19,12 +19,13 @@ from haystack.file_converter.txt import TextConverter logger = logging.getLogger(__name__) -def eval_data_from_file(filename: str, n_docs: Union[int, bool]=None) -> Tuple[List[Document], List[Label]]: +def eval_data_from_file(filename: str, max_docs: Union[int, bool]=None) -> Tuple[List[Document], List[Label]]: """ Read Documents + Labels from a SQuAD-style file. Document and Labels can then be indexed to the DocumentStore and be used for evaluation. 
:param filename: Path to file in SQuAD format + :param max_docs: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents. :return: (List of Documents, List of Labels) """ docs = [] diff --git a/test/benchmarks/reader_results.csv b/test/benchmarks/reader_results.csv index 5a03edcc1..5fc081050 100644 --- a/test/benchmarks/reader_results.csv +++ b/test/benchmarks/reader_results.csv @@ -1,2 +1,6 @@ ,EM,f1,top_n_accuracy,top_n,reader_time,seconds_per_query,passages_per_second,reader,error -0,0.7067307692307693,0.7384645069741224,0.9567307692307693,5,2.9132275839983777,0.014005901846146047,4239.2843140149525,deepset/minilm-uncased-squad2, +0,0.7589752233271532,0.8067985794671885,0.9671329849991572,5,133.79706027999998,0.011275666634080564,92.30397120949361,deepset/roberta-base-squad2, +1,0.7359683128265633,0.7823306265318686,0.9714309792684982,5,125.22323393199997,0.010553112584864317,98.62387044489225,deepset/minilm-uncased-squad2, +2,0.700825889094893,0.7490271600053505,0.9585369964604753,5,123.58959278499992,0.010415438461570867,99.92750782409666,deepset/bert-base-cased-squad2, +3,0.7821506826226192,0.8264545708097472,0.9762346199224675,5,312.42233685099995,0.026329204184308102,39.529824033964466,deepset/bert-large-uncased-whole-word-masking-squad2, +4,0.8099612337771785,0.8526275190954586,0.9772459126917242,5,314.3179854819998,0.026488958830439897,39.29142006004379,deepset/xlm-roberta-large-squad2, \ No newline at end of file From 8c4865ee5fe5cdb58ca031d7879498eb9db3eab9 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 27 Oct 2020 12:45:15 +0100 Subject: [PATCH 8/9] Rename n_docs variable to max_docs --- haystack/preprocessor/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py index 06a2ad407..685e4396b 100644 --- a/haystack/preprocessor/utils.py +++ b/haystack/preprocessor/utils.py @@ -35,7 +35,7 @@ def eval_data_from_file(filename: str, max_docs: Union[int, bool]=None) -> Tuple data = json.load(file) if "title" not in data["data"][0]: logger.warning(f"No title information found for documents in QA file: {filename}") - for document in data["data"][:n_docs]: + for document in data["data"][:max_docs]: # get all extra fields from document level (e.g. 
title) meta_doc = {k: v for k, v in document.items() if k not in ("paragraphs", "title")} for paragraph in document["paragraphs"]: From 18d315d61a756e9fed32a9c78ffaa5689bda001e Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Wed, 28 Oct 2020 09:55:31 +0100 Subject: [PATCH 9/9] Make returning predictions in evaluation possible (#524) * Make returning preds in evaluation possible * Make returning preds in evaluation possible * Add automated check if eval dict contains predictions --- haystack/finder.py | 30 +++++++++++++++++++++++++----- haystack/reader/farm.py | 2 +- haystack/retriever/base.py | 24 ++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/haystack/finder.py b/haystack/finder.py index 4ead6911c..94a389dd9 100644 --- a/haystack/finder.py +++ b/haystack/finder.py @@ -121,6 +121,7 @@ class Finder: label_origin: str = "gold_label", top_k_retriever: int = 10, top_k_reader: int = 10, + return_preds: bool = False, ): """ Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result @@ -165,6 +166,9 @@ class Finder: :type top_k_retriever: int :param top_k_reader: How many answers to return per question :type top_k_reader: int + :param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary + contains the keys "predictions" and "metrics". + :type return_preds: bool """ if not self.reader or not self.retriever: @@ -205,6 +209,7 @@ class Finder: previous_return_no_answers = self.reader.return_no_answers self.reader.return_no_answers = True + predictions = [] # extract answers reader_start_time = time.time() for q_idx, question_docs in enumerate(questions_with_docs): @@ -215,8 +220,10 @@ class Finder: question_string = question.question docs = question_docs["docs"] # type: ignore single_reader_start = time.time() - predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader) # type: ignore + predicted_answers = self.reader.predict(question_string, docs, top_k=top_k_reader) # type: ignore read_times.append(time.time() - single_reader_start) + if return_preds: + predictions.append(predicted_answers) counts = eval_counts_reader(question, predicted_answers, counts) counts["number_of_has_answer"] = counts["correct_retrievals"] - counts["number_of_no_answer"] @@ -240,7 +247,10 @@ class Finder: eval_results["avg_reader_time"] = mean(read_times) eval_results["total_finder_time"] = finder_total_time - return eval_results + if return_preds: + return {"metrics": eval_results, "predictions": predictions} + else: + return eval_results def eval_batch( self, @@ -249,7 +259,8 @@ class Finder: label_origin: str = "gold_label", top_k_retriever: int = 10, top_k_reader: int = 10, - batch_size: int = 50 + batch_size: int = 50, + return_preds: bool = False, ): """ Evaluation of the whole pipeline by first evaluating the Retriever and then evaluating the Reader on the result @@ -296,10 +307,13 @@ class Finder: :type top_k_reader: int :param batch_size: Number of samples per batch computed at once :type batch_size: int + :param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary + contains the keys "predictions" and "metrics". 
+ :type return_preds: bool """ if not self.reader or not self.retriever: - raise Exception("Finder needs to have a reader and retriever for the evalutaion.") + raise Exception("Finder needs to have a reader and retriever for the evaluation.") counts = defaultdict(float) # type: Dict[str, float] finder_start_time = time.time() @@ -344,7 +358,10 @@ class Finder: logger.info(f"{number_of_questions - correct_retrievals} questions could not be answered due to the retriever.") logger.info(f"{correct_retrievals - counts['correct_readings_topk']} questions could not be answered due to the reader.") - return results + if return_preds: + return {"metrics": results, "predictions": predictions} + else: + return results def _retrieve_docs(self, questions: List[MultiLabel], top_k: int, doc_index: str): @@ -364,6 +381,9 @@ class Finder: @staticmethod def print_eval_results(finder_eval_results: Dict): + if "predictions" in finder_eval_results.keys(): + finder_eval_results = finder_eval_results["metrics"] + print("\n___Retriever Metrics in Finder___") print(f"Retriever Recall : {finder_eval_results['retriever_recall']:.3f}") print(f"Retriever Mean Avg Precision: {finder_eval_results['retriever_map']:.3f}") diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index 36047460b..6c2bf7e90 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -291,7 +291,7 @@ class FARMReader(BaseReader): result = [] for idx, group in enumerate(grouped_predictions): answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_question) - question = group[0] + question = group[0].question cur_label = labels[idx] result.append({ "question": question, diff --git a/haystack/retriever/base.py b/haystack/retriever/base.py index e73492f8f..21e97b449 100644 --- a/haystack/retriever/base.py +++ b/haystack/retriever/base.py @@ -45,7 +45,8 @@ class BaseRetriever(ABC): doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, - open_domain: bool = False + open_domain: bool = False, + return_preds: bool = False, ) -> dict: """ Performs evaluation on the Retriever. @@ -65,6 +66,8 @@ class BaseRetriever(ABC): contained in the retrieved docs (common approach in open-domain QA). If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids are within ids explicitly stated in the labels. + :param return_preds: Whether to add predictions in the returned dictionary. If True, the returned dictionary + contains the keys "predictions" and "metrics". 
""" # Extract all questions for evaluation @@ -86,11 +89,15 @@ class BaseRetriever(ABC): deduplicated_doc_ids = list(set([str(x) for x in label.multiple_document_ids])) question_label_dict[label.question] = deduplicated_doc_ids + predictions = [] + # Option 1: Open-domain evaluation by checking if the answer string is in the retrieved docs logger.info("Performing eval queries...") if open_domain: for question, gold_answers in tqdm(question_label_dict.items()): retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index) + if return_preds: + predictions.append({"question": question, "retrieved_docs": retrieved_docs}) # check if correct doc in retrieved docs for doc_idx, doc in enumerate(retrieved_docs): for gold_answer in gold_answers: @@ -102,6 +109,8 @@ class BaseRetriever(ABC): else: for question, gold_ids in tqdm(question_label_dict.items()): retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index) + if return_preds: + predictions.append({"question": question, "retrieved_docs": retrieved_docs}) # check if correct doc in retrieved docs for doc_idx, doc in enumerate(retrieved_docs): for gold_id in gold_ids: @@ -117,4 +126,15 @@ class BaseRetriever(ABC): logger.info((f"For {correct_retrievals} out of {number_of_questions} questions ({recall:.2%}), the answer was in" f" the top-{top_k} candidate passages selected by the retriever.")) - return {"recall": recall, "map": mean_avg_precision, "retrieve_time": self.retrieve_time, "n_questions": number_of_questions, "top_k": top_k} \ No newline at end of file + metrics = { + "recall": recall, + "map": mean_avg_precision, + "retrieve_time": self.retrieve_time, + "n_questions": number_of_questions, + "top_k": top_k + } + + if return_preds: + return {"metrics": metrics, "predictions": predictions} + else: + return metrics \ No newline at end of file