diff --git a/docs/_src/benchmarks/reader_performance.json b/docs/_src/benchmarks/reader_performance.json index 24209b35b..be935fe27 100644 --- a/docs/_src/benchmarks/reader_performance.json +++ b/docs/_src/benchmarks/reader_performance.json @@ -11,33 +11,33 @@ ], "data": [ { - "F1": 82.62983412843887, - "Speed": 98.86638639776464, + "F1": 82.58860575299658, + "Speed": 125.81040525892848, "Model": "RoBERTa" }, { - "F1": 78.90026641413856, - "Speed": 181.96379531485616, + "F1": 78.87858491007042, + "Speed": 260.6443097981493, "Model": "MiniLM" }, { - "F1": 74.32668866064459, - "Speed": 106.04748306200683, + "F1": 74.31182400443286, + "Speed": 121.08066567525722, "Model": "BERT base" }, { - "F1": 83.29492827667042, - "Speed": 40.408497243719076, + "F1": 83.26306774734308, + "Speed": 42.21949937744112, "Model": "BERT large" }, { - "F1": 84.62174414643722, - "Speed": 40.483264542292716, + "F1": 84.50422699207468, + "Speed": 42.07400844838985, "Model": "XLM-RoBERTa" }, { - "F1": 42.342513261953935, - "Speed": 160.41712955027901, + "F1": 42.31925844723574, + "Speed": 222.91207128366702, "Model": "DistilBERT" } ] diff --git a/docs/_src/benchmarks/retriever_map.json b/docs/_src/benchmarks/retriever_map.json index 8584e7065..65683484c 100644 --- a/docs/_src/benchmarks/retriever_map.json +++ b/docs/_src/benchmarks/retriever_map.json @@ -20,82 +20,82 @@ { "model": "DPR / ElasticSearch", "n_docs": 1000, - "map": 0.929 + "map": 92.95105322830888 }, { "model": "DPR / ElasticSearch", "n_docs": 10000, - "map": 0.881 + "map": 89.87097014904354 }, { "model": "DPR / ElasticSearch", "n_docs": 100000, - "map": 0.821 + "map": 86.54564090434241 }, { "model": "DPR / ElasticSearch", "n_docs": 500000, - "map": 0.730 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 1000, - "map": 0.929 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 10000, - "map": 0.898 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 100000, - "map": 0.863 - }, - { - "model": "DPR / FAISS (flat)", - 
"n_docs": 500000, - "map": 0.805 + "map": 80.86137228234089 }, { "model": "BM25 / ElasticSearch", "n_docs": 1000, - "map": 0.748 + "map": 74.20444712972909 }, { "model": "BM25 / ElasticSearch", "n_docs": 10000, - "map": 0.6609999999999999 + "map": 66.20627317806674 }, { "model": "BM25 / ElasticSearch", "n_docs": 100000, - "map": 0.56 + "map": 56.25959153101251 }, { "model": "BM25 / ElasticSearch", "n_docs": 500000, - "map": 0.452 + "map": 45.59452709000341 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "map": 92.95105322830888 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "map": 86.54606328368972 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "map": 80.8613722823409 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 1000, - "map": 0.929 + "map": 92.95105322830888 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 10000, - "map": 0.896 + "map": 89.69941373746582 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 100000, - "map": 0.849 + "map": 85.07984377595874 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 500000, - "map": 0.766 + "map": 76.91475821598232 } ] } \ No newline at end of file diff --git a/docs/_src/benchmarks/retriever_performance.json b/docs/_src/benchmarks/retriever_performance.json index f4985dade..d5075ca3c 100644 --- a/docs/_src/benchmarks/retriever_performance.json +++ b/docs/_src/benchmarks/retriever_performance.json @@ -2,7 +2,7 @@ "chart_type": "BarChart", "title": "Retriever Performance", "subtitle": "Time and Accuracy Benchmarks", - "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. 
Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.", + "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. 
Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.", "bars": "horizontal", "columns": [ "Model", @@ -24,30 +24,30 @@ { "model": "DPR / ElasticSearch", "n_docs": 100000, - "index_speed": 73.93635160290218, - "query_speed": 6.23, - "map": 82 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 100000, - "index_speed": 104.77116699738369, - "query_speed": 4.89, - "map": 86.3 + "index_speed": 69.75508852811794, + "query_speed": 4.5992769354707805, + "map": 86.54564090434241 }, { "model": "BM25 / ElasticSearch", "n_docs": 100000, - "index_speed": 484.32931514144724, - "query_speed": 162.59, - "map": 56 + "index_speed": 482.9993330442806, + "query_speed": 162.42378943468643, + "map": 56.25959153101251 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "index_speed": 95.52108545730724, + "query_speed": 6.511162294559942, + "map": 86.54606328368972 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 100000, - "index_speed": 91.41086878008392, - "query_speed": 12.85, - "map": 84.9 + "index_speed": 84.11829911061136, + "query_speed": 33.65729082116796, + "map": 85.07984377595874 } ] } \ No newline at end of file diff --git a/docs/_src/benchmarks/retriever_speed.json b/docs/_src/benchmarks/retriever_speed.json index 8c68cea84..e099421c7 100644 --- a/docs/_src/benchmarks/retriever_speed.json +++ b/docs/_src/benchmarks/retriever_speed.json @@ -20,82 +20,82 @@ { "model": "DPR / ElasticSearch", "n_docs": 1000, - "query_speed": 40.802 + "query_speed": 30.68451185154913 }, { "model": "DPR / 
ElasticSearch", "n_docs": 10000, - "query_speed": 24.8 + "query_speed": 19.568754413737462 }, { "model": "DPR / ElasticSearch", "n_docs": 100000, - "query_speed": 6.23 + "query_speed": 4.5992769354707805 }, { "model": "DPR / ElasticSearch", "n_docs": 500000, - "query_speed": 1.45 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 1000, - "query_speed": 40.048 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 10000, - "query_speed": 22.47 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 100000, - "query_speed": 4.90 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 500000, - "query_speed": 1.08 + "query_speed": 1.0558140319761546 }, { "model": "BM25 / ElasticSearch", "n_docs": 1000, - "query_speed": 232.97799999999998 + "query_speed": 262.9405144288997 }, { "model": "BM25 / ElasticSearch", "n_docs": 10000, - "query_speed": 248.97 + "query_speed": 183.6070813438718 }, { "model": "BM25 / ElasticSearch", "n_docs": 100000, - "query_speed": 162.59 + "query_speed": 162.42378943468643 }, { "model": "BM25 / ElasticSearch", "n_docs": 500000, - "query_speed": 91.39 + "query_speed": 82.43179203331141 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "query_speed": 35.40380445859966 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "query_speed": 25.78749025294445 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "query_speed": 6.511162294559942 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "query_speed": 1.5161593755666505 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 1000, - "query_speed": 37.884 + "query_speed": 39.16414272911727 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 10000, - "query_speed": 31.34 + "query_speed": 33.6432023480111 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 100000, - "query_speed": 12.85 + "query_speed": 33.65729082116796 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 500000, - "query_speed": 3.32 + "query_speed": 34.27671486454735 } ] } \ No newline at end of file diff --git 
a/docs/v0.7.0/_src/benchmarks/reader_performance.json b/docs/v0.7.0/_src/benchmarks/reader_performance.json index 24209b35b..a6264ed64 100644 --- a/docs/v0.7.0/_src/benchmarks/reader_performance.json +++ b/docs/v0.7.0/_src/benchmarks/reader_performance.json @@ -11,33 +11,33 @@ ], "data": [ { - "F1": 82.62983412843887, - "Speed": 98.86638639776464, + "F1": 82.58860575299658, + "Speed": 128.25544874114064, "Model": "RoBERTa" }, { - "F1": 78.90026641413856, - "Speed": 181.96379531485616, + "F1": 78.87858491007042, + "Speed": 269.33155450679567, "Model": "MiniLM" }, { - "F1": 74.32668866064459, - "Speed": 106.04748306200683, + "F1": 74.31182400443286, + "Speed": 123.82266420208393, "Model": "BERT base" }, { - "F1": 83.29492827667042, - "Speed": 40.408497243719076, + "F1": 83.26306774734308, + "Speed": 43.188105620245494, "Model": "BERT large" }, { - "F1": 84.62174414643722, - "Speed": 40.483264542292716, + "F1": 84.50422699207468, + "Speed": 42.956527893643, "Model": "XLM-RoBERTa" }, { - "F1": 42.342513261953935, - "Speed": 160.41712955027901, + "F1": 42.31925844723574, + "Speed": 226.281948654048, "Model": "DistilBERT" } ] diff --git a/docs/v0.7.0/_src/benchmarks/retriever_map.json b/docs/v0.7.0/_src/benchmarks/retriever_map.json index 8584e7065..8379f5fc1 100644 --- a/docs/v0.7.0/_src/benchmarks/retriever_map.json +++ b/docs/v0.7.0/_src/benchmarks/retriever_map.json @@ -17,85 +17,65 @@ } ], "data": [ - { - "model": "DPR / ElasticSearch", - "n_docs": 1000, - "map": 0.929 - }, { "model": "DPR / ElasticSearch", "n_docs": 10000, - "map": 0.881 + "map": 88.26183154948457 }, { "model": "DPR / ElasticSearch", "n_docs": 100000, - "map": 0.821 + "map": 82.47044752499787 }, { "model": "DPR / ElasticSearch", "n_docs": 500000, - "map": 0.730 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 1000, - "map": 0.929 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 10000, - "map": 0.898 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 100000, - "map": 0.863 - }, - { - 
"model": "DPR / FAISS (flat)", - "n_docs": 500000, - "map": 0.805 - }, - { - "model": "BM25 / ElasticSearch", - "n_docs": 1000, - "map": 0.748 + "map": 73.4952735751035 }, { "model": "BM25 / ElasticSearch", "n_docs": 10000, - "map": 0.6609999999999999 + "map": 66.33019927857616 }, { "model": "BM25 / ElasticSearch", "n_docs": 100000, - "map": 0.56 + "map": 56.25959153101251 }, { "model": "BM25 / ElasticSearch", "n_docs": 500000, - "map": 0.452 + "map": 45.60339705629754 }, { - "model": "DPR / FAISS (HSNW)", - "n_docs": 1000, - "map": 0.929 + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "map": 86.54606328368972 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "map": 80.8613722823409 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 10000, - "map": 0.896 + "map": 89.69941373746582 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 100000, - "map": 0.849 + "map": 85.07984377595874 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 500000, - "map": 0.766 + "map": 76.91475821598232 } ] } \ No newline at end of file diff --git a/docs/v0.7.0/_src/benchmarks/retriever_performance.json b/docs/v0.7.0/_src/benchmarks/retriever_performance.json index f4985dade..08b6dcadf 100644 --- a/docs/v0.7.0/_src/benchmarks/retriever_performance.json +++ b/docs/v0.7.0/_src/benchmarks/retriever_performance.json @@ -24,30 +24,30 @@ { "model": "DPR / ElasticSearch", "n_docs": 100000, - "index_speed": 73.93635160290218, - "query_speed": 6.23, - "map": 82 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 100000, - "index_speed": 104.77116699738369, - "query_speed": 4.89, - "map": 86.3 + "index_speed": 70.7842323095542, + "query_speed": 6.108417678791537, + "map": 82.47044752499787 }, { "model": "BM25 / ElasticSearch", "n_docs": 100000, - "index_speed": 484.32931514144724, - "query_speed": 162.59, - "map": 56 + "index_speed": 486.8274411916137, + "query_speed": 162.40717155994315, + "map": 
56.25959153101251 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "index_speed": 98.76191471019415, + "query_speed": 6.614453113633132, + "map": 86.54606328368972 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 100000, - "index_speed": 91.41086878008392, - "query_speed": 12.85, - "map": 84.9 + "index_speed": 86.8695584169603, + "query_speed": 38.24323466239034, + "map": 85.07984377595874 } ] } \ No newline at end of file diff --git a/docs/v0.7.0/_src/benchmarks/retriever_speed.json b/docs/v0.7.0/_src/benchmarks/retriever_speed.json index 8c68cea84..0c599ceb8 100644 --- a/docs/v0.7.0/_src/benchmarks/retriever_speed.json +++ b/docs/v0.7.0/_src/benchmarks/retriever_speed.json @@ -17,85 +17,65 @@ } ], "data": [ - { - "model": "DPR / ElasticSearch", - "n_docs": 1000, - "query_speed": 40.802 - }, { "model": "DPR / ElasticSearch", "n_docs": 10000, - "query_speed": 24.8 + "query_speed": 23.3662850838307 }, { "model": "DPR / ElasticSearch", "n_docs": 100000, - "query_speed": 6.23 + "query_speed": 6.108417678791537 }, { "model": "DPR / ElasticSearch", "n_docs": 500000, - "query_speed": 1.45 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 1000, - "query_speed": 40.048 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 10000, - "query_speed": 22.47 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 100000, - "query_speed": 4.90 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 500000, - "query_speed": 1.08 - }, - { - "model": "BM25 / ElasticSearch", - "n_docs": 1000, - "query_speed": 232.97799999999998 + "query_speed": 1.4393100251286972 }, { "model": "BM25 / ElasticSearch", "n_docs": 10000, - "query_speed": 248.97 + "query_speed": 244.5160214986071 }, { "model": "BM25 / ElasticSearch", "n_docs": 100000, - "query_speed": 162.59 + "query_speed": 162.40717155994315 }, { "model": "BM25 / ElasticSearch", "n_docs": 500000, - "query_speed": 91.39 + "query_speed": 88.52692529827672 }, { - "model": "DPR / FAISS (HSNW)", - "n_docs": 1000, - "query_speed": 
37.884 + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "query_speed": 27.22085301792775 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "query_speed": 6.614453113633132 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "query_speed": 1.5222363376940002 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 10000, - "query_speed": 31.34 + "query_speed": 39.903073511580295 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 100000, - "query_speed": 12.85 + "query_speed": 38.24323466239034 }, { "model": "DPR / FAISS (HSNW)", "n_docs": 500000, - "query_speed": 3.32 + "query_speed": 37.13917579922844 } ] } \ No newline at end of file diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index 2fc0f6847..c8a6e665b 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -524,9 +524,9 @@ class FARMReader(BaseReader): toc = perf_counter() reader_time = toc - tic results = { - "EM": eval_results[0]["EM"], - "f1": eval_results[0]["f1"], - "top_n_accuracy": eval_results[0]["top_n_accuracy"], + "EM": eval_results[0]["EM"] * 100, + "f1": eval_results[0]["f1"] * 100, + "top_n_accuracy": eval_results[0]["top_n_accuracy"] * 100, "top_n": self.inferencer.model.prediction_heads[0].n_best, "reader_time": reader_time, "seconds_per_query": reader_time / n_queries diff --git a/haystack/retriever/dense.py b/haystack/retriever/dense.py index e357859b8..fe5d207cc 100644 --- a/haystack/retriever/dense.py +++ b/haystack/retriever/dense.py @@ -187,8 +187,7 @@ class DensePassageRetriever(BaseRetriever): :return: dictionary of embeddings for "passages" and "query" """ - - dataset, tensor_names, problematic_ids, baskets = self.processor.dataset_from_dicts( + dataset, tensor_names, _, baskets = self.processor.dataset_from_dicts( dicts, indices=[i for i in range(len(dicts))], return_baskets=True ) diff --git a/test/benchmarks/config.json b/test/benchmarks/config.json index c352fbcad..77d03eb4d 100644 --- a/test/benchmarks/config.json +++ 
b/test/benchmarks/config.json @@ -20,6 +20,7 @@ ] ], "n_docs_options": [ + 1000, 10000, 100000, 500000 diff --git a/test/benchmarks/reader.py b/test/benchmarks/reader.py index 41308a106..9ada5a748 100644 --- a/test/benchmarks/reader.py +++ b/test/benchmarks/reader.py @@ -1,5 +1,5 @@ from utils import get_document_store, index_to_doc_store, get_reader -from haystack.preprocessor.utils import eval_data_from_file +from haystack.preprocessor.utils import eval_data_from_json from farm.data_handler.utils import _download_extract_downstream_data from pathlib import Path @@ -40,7 +40,7 @@ def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs) doc_store = get_document_store("elasticsearch") # download squad data _download_extract_downstream_data(input_file=data_dir/filename) - docs, labels = eval_data_from_file(data_dir/filename, max_docs=None) + docs, labels = eval_data_from_json(data_dir/filename, max_docs=None) index_to_doc_store(doc_store, docs, None, labels) for reader_name in reader_models: diff --git a/test/benchmarks/reader_results.csv b/test/benchmarks/reader_results.csv index 9b9411a83..9ef56e9bb 100644 --- a/test/benchmarks/reader_results.csv +++ b/test/benchmarks/reader_results.csv @@ -1,7 +1,7 @@ ,EM,f1,top_n_accuracy,top_n,reader_time,seconds_per_query,passages_per_second,reader,error -0,0.7836676217765043,0.8262983412843887,0.9742963087813922,5,124.91606550999859,0.01052722615118815,98.86638639776463,deepset/roberta-base-squad2, -1,0.7439743805831789,0.7890026641413856,0.9720209000505646,5,67.87064415001078,0.005719757639475036,181.96379531485616,deepset/minilm-uncased-squad2, -2,0.6947581324793528,0.7432668866064459,0.9557559413450194,5,116.45726653200109,0.009814365964267747,106.04748306200683,deepset/bert-base-cased-squad2, -3,0.7900724759817968,0.8329492827667042,0.976908815101972,5,305.62878707199707,0.02575668187021718,40.40849724371908,deepset/bert-large-uncased-whole-word-masking-squad2, 
-4,0.803472105174448,0.846217441464372,0.9742120343839542,5,305.06433064700104,0.025709112645120602,40.48326454229272,deepset/xlm-roberta-large-squad2, -5,0.3730827574582842,0.42342513261953935,0.9539019046013821,5,76.98679084099422,0.006488015408814615,160.417129550279,distilbert-base-uncased-distilled-squad, +0,0.7839204449688185,0.8258860575299658,0.9742120343839542,5,98.16358173700064,0.008272676701247315,125.81040525892847,deepset/roberta-base-squad2, +1,0.7438058317883027,0.7887858491007042,0.9719366256531266,5,47.38258053499885,0.003993138423647299,260.6443097981493,deepset/minilm-uncased-squad2, +2,0.6947581324793528,0.7431182400443286,0.9557559413450194,5,101.99811779300217,0.008595829916821352,121.08066567525722,deepset/bert-base-cased-squad2, +3,0.7897353783920446,0.8326306774734308,0.976908815101972,5,292.51886408200517,0.024651851009776266,42.21949937744112,deepset/bert-large-uncased-whole-word-masking-squad2, +4,0.8021237148154391,0.8450422699207468,0.974043485589078,5,293.53038741600176,0.024737096529243364,42.07400844838984,deepset/xlm-roberta-large-squad2, +5,0.3729984830608461,0.4231925844723574,0.9539019046013821,5,55.403011280999635,0.004669055391960192,222.91207128366705,distilbert-base-uncased-distilled-squad, diff --git a/test/benchmarks/retriever.py b/test/benchmarks/retriever.py index f871dfaca..f71297b69 100644 --- a/test/benchmarks/retriever.py +++ b/test/benchmarks/retriever.py @@ -2,7 +2,7 @@ import pandas as pd from pathlib import Path from time import perf_counter from utils import get_document_store, get_retriever, index_to_doc_store, load_config -from haystack.preprocessor.utils import eval_data_from_file +from haystack.preprocessor.utils import eval_data_from_json from haystack.document_store.faiss import FAISSDocumentStore from haystack import Document @@ -134,7 +134,11 @@ def benchmark_querying(n_docs_options, for retriever_name, doc_store_name in retriever_doc_stores: try: logger.info(f"##### Start querying run: 
{retriever_name}, {doc_store_name}, {n_docs} docs ##### ") - doc_store = get_document_store(doc_store_name) + if retriever_name == "elastic": + similarity = "cosine" + else: + similarity = "dot_product" + doc_store = get_document_store(doc_store_name, similarity=similarity) retriever = get_retriever(retriever_name, doc_store) add_precomputed = retriever_name in ["dpr"] # For DPR, precomputed embeddings are loaded from file @@ -160,8 +164,8 @@ def benchmark_querying(n_docs_options, "retrieve_time": raw_results["retrieve_time"], "queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"], "seconds_per_query": raw_results["retrieve_time"]/ raw_results["n_questions"], - "recall": raw_results["recall"], - "map": raw_results["map"], + "recall": raw_results["recall"] * 100, + "map": raw_results["map"] * 100, "top_k": raw_results["top_k"], "date_time": datetime.datetime.now(), "error": None @@ -265,7 +269,7 @@ def prepare_data(data_dir, filename_gold, filename_negative, data_s3_url, embed download_from_s3(data_s3_url + str(embeddings_dir) + embedding_filename, cache_dir=data_dir) logging.getLogger("farm").setLevel(logging.WARN) - gold_docs, labels = eval_data_from_file(data_dir + filename_gold) + gold_docs, labels = eval_data_from_json(data_dir + filename_gold) # Reduce number of docs gold_docs = gold_docs[:n_docs] diff --git a/test/benchmarks/retriever_index_results.csv b/test/benchmarks/retriever_index_results.csv index 1a9253e30..b587e10b4 100644 --- a/test/benchmarks/retriever_index_results.csv +++ b/test/benchmarks/retriever_index_results.csv @@ -1,13 +1,17 @@ ,retriever,doc_store,n_docs,indexing_time,docs_per_second,date_time,error -1,dpr,elasticsearch,10000,135.8048727600035,73.63506033890373,2020-12-02 06:51:48.587178, -5,dpr,elasticsearch,100000,1352.514667440999,73.93635160290218,2020-12-02 07:23:04.264694, -9,dpr,elasticsearch,500000,6781.024389943996,73.7351720400064,2020-12-02 10:10:42.147031, 
-0,elastic,elasticsearch,10000,20.694342684997537,483.223852635317,2020-12-02 06:49:00.317977, -4,elastic,elasticsearch,100000,206.47108666299755,484.32931514144724,2020-12-02 06:59:54.055199, -8,elastic,elasticsearch,500000,1032.1480222880054,484.4266415311529,2020-12-02 08:16:15.828533, -2,dpr,faiss_flat,10000,95.10171413100034,105.15057579535569,2020-12-02 06:53:59.472952, -6,dpr,faiss_flat,100000,954.4610684969957,104.77116699738367,2020-12-02 07:39:56.194345, -10,dpr,faiss_flat,500000,4865.149988802004,102.77175444761984,2020-12-02 11:34:34.726687, -3,dpr,faiss_hnsw,10000,103.25490099400486,96.84770314757859,2020-12-02 06:56:14.230579, -7,dpr,faiss_hnsw,100000,1093.9618158599915,91.41086878008392,2020-12-02 07:58:43.508489, -11,dpr,faiss_hnsw,500000,5784.850161597002,86.43266221816312,2020-12-02 13:11:43.328380, +1,dpr,elasticsearch,1000,15.336494209999728,65.20394989279743,2021-01-29 11:18:25.436371, +5,dpr,elasticsearch,10000,144.0823780490009,69.40474008972218,2021-01-29 11:23:19.896920, +9,dpr,elasticsearch,100000,1433.587170629,69.75508852811794,2021-01-29 11:56:22.502185, +13,dpr,elasticsearch,500000,7196.396471723998,69.47921810097519,2021-01-29 14:54:08.769187, +0,elastic,elasticsearch,1000,2.1182381880007597,472.0904408506686,2021-01-29 11:17:14.160560, +4,elastic,elasticsearch,10000,20.23965223199957,494.0796356268248,2021-01-29 11:20:27.378846, +8,elastic,elasticsearch,100000,207.03962336699988,482.9993330442806,2021-01-29 11:31:50.829072, +12,elastic,elasticsearch,500000,1029.1638562459993,485.8312862091863,2021-01-29 12:52:45.994426, +2,dpr,faiss_flat,1000,9.899907313998483,101.01104669798258,2021-01-29 11:19:11.304749, +6,dpr,faiss_flat,10000,104.1660261320012,96.00059032037764,2021-01-29 11:25:43.069491, +10,dpr,faiss_flat,100000,1046.8892760299987,95.52108545730724,2021-01-29 12:14:51.105055, +14,dpr,faiss_flat,500000,5243.775349973999,95.35114810028603,2021-01-29 16:24:19.855339, 
+3,dpr,faiss_hnsw,1000,10.329135104999295,96.81352696374361,2021-01-29 11:19:55.337391, +7,dpr,faiss_hnsw,10000,112.53792207699917,88.85893586304122,2021-01-29 11:28:10.284866, +11,dpr,faiss_hnsw,100000,1188.8019736170008,84.11829911061136,2021-01-29 12:35:16.166263, +15,dpr,faiss_hnsw,500000,6125.295488232001,81.62871504903015,2021-01-29 18:07:08.100722, diff --git a/test/benchmarks/retriever_query_results.csv b/test/benchmarks/retriever_query_results.csv index c5759ff1a..106ab88ee 100644 --- a/test/benchmarks/retriever_query_results.csv +++ b/test/benchmarks/retriever_query_results.csv @@ -1,13 +1,17 @@ ,retriever,doc_store,n_docs,n_queries,retrieve_time,queries_per_second,seconds_per_query,recall,map,top_k,date_time,error -1,dpr,elasticsearch,10000,5791,233.54168710688828,24.796429587106445,0.040328386652890395,0.9690899671904679,0.8808447974826822,10,2020-12-02 13:18:27.808539, -5,dpr,elasticsearch,100000,5791,928.9148432369257,6.234155953220104,0.1604066384453334,0.9397340701087895,0.8212235461156204,10,2020-12-02 13:53:44.689757, -9,dpr,elasticsearch,500000,5791,3992.798643678747,1.45036114184423,0.6894834473629333,0.8919012260404076,0.7302081363253893,10,2020-12-02 17:35:25.795083, -0,elastic,elasticsearch,10000,5791,23.260322959773475,248.9647289083211,0.00401663321702184,0.8103954412018649,0.6609973604361457,10,2020-12-02 13:13:03.957613, -4,elastic,elasticsearch,100000,5791,35.61682877641579,162.59167924109505,0.006150376234918976,0.7168019340355725,0.559593430418849,10,2020-12-02 13:33:30.417021, -8,elastic,elasticsearch,500000,5791,63.36918604133825,91.38510941614904,0.010942701785760362,0.6238991538594371,0.45245893326535686,10,2020-12-02 16:08:13.070376, -2,dpr,faiss_flat,10000,5791,257.67369354520633,22.474160712040344,0.044495543696288435,0.9746157831117251,0.8978985590667505,10,2020-12-02 13:23:51.002905, -6,dpr,faiss_flat,100000,5791,1182.7107160334417,4.896379073508164,0.2042325532780939,0.9575202901053359,0.8630120493486063,10,2020-12-02 
14:18:14.837806, -3,dpr,faiss_hnsw,10000,5791,184.7552210999711,31.34417509568776,0.03190385444655001,0.972198238646175,0.8961883245210815,10,2020-12-02 13:28:33.415220, -7,dpr,faiss_hnsw,100000,5791,450.7693457186833,12.84692505158515,0.0778396383558424,0.9399067518563288,0.8486882354392283,10,2020-12-02 15:10:44.114148, -8,dpr,faiss_flat,500000,5791,5365.806154628852,1.0792413727067556,0.9265767837383616,0.9295458470039717,0.8045832613826054,10,2020-12-02 23:14:44.503864, -9,dpr,faiss_hnsw,500000,5791,1745.922715222303,3.3168707580865915,0.30148898553312087,0.8820583664306683,0.765677378416975,10,2020-12-03 00:18:53.376265, +1,dpr,elasticsearch,1000,1064,34.67547423102587,30.684511851549132,0.0325897314201371,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:27:43.048502, +5,dpr,elasticsearch,10000,5637,288.06125728895495,19.568754413737462,0.05110187285594375,0.9749866950505588,0.8987097014904354,10,2021-02-01 11:37:21.149887, +9,dpr,elasticsearch,100000,5637,1225.6274364620313,4.5992769354707805,0.2174254810115365,0.9579563597658329,0.8654564090434241,10,2021-02-01 12:15:52.757320, +13,dpr,elasticsearch,500000,5637,5339.008413678017,1.0558140319761546,0.947136493467805,0.9308142629058009,0.8086137228234089,10,2021-02-01 14:52:23.056230, +0,elastic,elasticsearch,1000,1064,4.046542626992959,262.9405144288997,0.0038031415667227064,0.8909774436090225,0.742044471297291,10,2021-02-01 11:26:04.346134, +4,elastic,elasticsearch,10000,5637,30.701430243001596,183.6070813438718,0.005446413028738974,0.8110697179350719,0.6620627317806674,10,2021-02-01 11:31:20.470092, +8,elastic,elasticsearch,100000,5637,34.705507238930295,162.42378943468643,0.006156733588598598,0.7193542664537875,0.5625959153101251,10,2021-02-01 11:50:36.048887, +12,elastic,elasticsearch,500000,5637,68.3838099470413,82.4317920333114,0.01213124178588634,0.6274614156466205,0.45594527090003406,10,2021-02-01 13:02:16.905187, 
+2,dpr,faiss_flat,1000,1064,30.053267333012627,35.40380445859966,0.028245552004711117,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:28:55.544474, +6,dpr,faiss_flat,10000,5637,218.59436279792135,25.78749025294445,0.038778492602079356,0.9749866950505588,0.8987097014904354,10,2021-02-01 11:42:07.545869, +10,dpr,faiss_flat,100000,5637,865.7440476809643,6.511162294559943,0.15358241044544338,0.9579563597658329,0.8654606328368972,10,2021-02-01 12:34:29.493598, +14,dpr,faiss_flat,500000,5637,3717.9468668280497,1.5161593755666505,0.6595612678424783,0.9308142629058009,0.808613722823409,10,2021-02-01 16:12:52.804436, +3,dpr,faiss_hnsw,1000,1064,27.167708159968242,39.164142729117266,0.02553356030072203,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:30:02.684535, +7,dpr,faiss_hnsw,10000,5637,167.55242089293097,33.6432023480111,0.02972368651639719,0.972503104488203,0.8969941373746582,10,2021-02-01 11:46:07.130588, +11,dpr,faiss_hnsw,100000,5637,167.48228578322596,33.65729082116796,0.029711244595214823,0.9402164271775767,0.8507984377595874,10,2021-02-01 12:43:21.697968, +15,dpr,faiss_hnsw,500000,5637,164.45566683610014,34.27671486454735,0.029174324434291313,0.8825616462657442,0.7691475821598232,10,2021-02-01 16:47:01.710072, diff --git a/test/benchmarks/retriever_query_results.md b/test/benchmarks/retriever_query_results.md new file mode 100644 index 000000000..42db185ec --- /dev/null +++ b/test/benchmarks/retriever_query_results.md @@ -0,0 +1,18 @@ +| | retriever | doc_store | n_docs | n_queries | retrieve_time | queries_per_second | seconds_per_query | recall | map | top_k | date_time | error | +|---:|:------------|:--------------|---------:|------------:|----------------:|---------------------:|--------------------:|---------:|---------:|--------:|:---------------------------|:--------| +| 1 | dpr | elasticsearch | 1000 | 1064 | 34.6755 | 30.6845 | 0.0325897 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:27:43.048502 | | +| 5 | dpr | elasticsearch | 10000 | 5637 | 
288.061 | 19.5688 | 0.0511019 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:37:21.149887 | | +| 9 | dpr | elasticsearch | 100000 | 5637 | 1225.63 | 4.59928 | 0.217425 | 0.957956 | 0.865456 | 10 | 2021-02-01 12:15:52.757320 | | +| 13 | dpr | elasticsearch | 500000 | 5637 | 5339.01 | 1.05581 | 0.947136 | 0.930814 | 0.808614 | 10 | 2021-02-01 14:52:23.056230 | | +| 0 | elastic | elasticsearch | 1000 | 1064 | 4.04654 | 262.941 | 0.00380314 | 0.890977 | 0.742044 | 10 | 2021-02-01 11:26:04.346134 | | +| 4 | elastic | elasticsearch | 10000 | 5637 | 30.7014 | 183.607 | 0.00544641 | 0.81107 | 0.662063 | 10 | 2021-02-01 11:31:20.470092 | | +| 8 | elastic | elasticsearch | 100000 | 5637 | 34.7055 | 162.424 | 0.00615673 | 0.719354 | 0.562596 | 10 | 2021-02-01 11:50:36.048887 | | +| 12 | elastic | elasticsearch | 500000 | 5637 | 68.3838 | 82.4318 | 0.0121312 | 0.627461 | 0.455945 | 10 | 2021-02-01 13:02:16.905187 | | +| 2 | dpr | faiss_flat | 1000 | 1064 | 30.0533 | 35.4038 | 0.0282456 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:28:55.544474 | | +| 6 | dpr | faiss_flat | 10000 | 5637 | 218.594 | 25.7875 | 0.0387785 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:42:07.545869 | | +| 10 | dpr | faiss_flat | 100000 | 5637 | 865.744 | 6.51116 | 0.153582 | 0.957956 | 0.865461 | 10 | 2021-02-01 12:34:29.493598 | | +| 14 | dpr | faiss_flat | 500000 | 5637 | 3717.95 | 1.51616 | 0.659561 | 0.930814 | 0.808614 | 10 | 2021-02-01 16:12:52.804436 | | +| 3 | dpr | faiss_hnsw | 1000 | 1064 | 27.1677 | 39.1641 | 0.0255336 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:30:02.684535 | | +| 7 | dpr | faiss_hnsw | 10000 | 5637 | 167.552 | 33.6432 | 0.0297237 | 0.972503 | 0.896994 | 10 | 2021-02-01 11:46:07.130588 | | +| 11 | dpr | faiss_hnsw | 100000 | 5637 | 167.482 | 33.6573 | 0.0297112 | 0.940216 | 0.850798 | 10 | 2021-02-01 12:43:21.697968 | | +| 15 | dpr | faiss_hnsw | 500000 | 5637 | 164.456 | 34.2767 | 0.0291743 | 0.882562 | 0.769148 | 10 | 2021-02-01 16:47:01.710072 | | \ No newline at end of file diff 
--git a/test/benchmarks/templates.py b/test/benchmarks/templates.py index c33e858d7..43eefa196 100644 --- a/test/benchmarks/templates.py +++ b/test/benchmarks/templates.py @@ -16,7 +16,7 @@ RETRIEVER_TEMPLATE = { "chart_type": "BarChart", "title": "Retriever Performance", "subtitle": "Time and Accuracy Benchmarks", - "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.", + "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. 
Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.", "bars": "horizontal", "columns": [ "Model", diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index 8baa2dbd1..2d9e40c91 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -22,7 +22,7 @@ reader_types = ["farm"] doc_index = "eval_document" label_index = "label" -def get_document_store(document_store_type, es_similarity='cosine'): +def get_document_store(document_store_type, similarity='dot_product'): """ TODO This method is taken from test/conftest.py but maybe should be within Haystack. 
Perhaps a class method of DocStore that just takes string for type of DocStore""" if document_store_type == "sql": @@ -35,7 +35,7 @@ def get_document_store(document_store_type, es_similarity='cosine'): # make sure we start from a fresh index client = Elasticsearch() client.indices.delete(index='haystack_test*', ignore=[404]) - document_store = ElasticsearchDocumentStore(index="eval_document", similarity=es_similarity, timeout=3000) + document_store = ElasticsearchDocumentStore(index="eval_document", similarity=similarity, timeout=3000) elif document_store_type in("faiss_flat", "faiss_hnsw"): if document_store_type == "faiss_flat": index_type = "Flat" @@ -48,12 +48,13 @@ def get_document_store(document_store_type, es_similarity='cosine'): status = subprocess.run( ['docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'], shell=True) - time.sleep(3) + time.sleep(6) status = subprocess.run( ['docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'], shell=True) time.sleep(1) document_store = FAISSDocumentStore(sql_url="postgresql://postgres:password@localhost:5432/haystack", - faiss_index_factory_str=index_type) + faiss_index_factory_str=index_type, + similarity=similarity) else: raise Exception(f"No document store fixture for '{document_store_type}'") diff --git a/test/test_eval.py b/test/test_eval.py index 95d781f21..528002da9 100644 --- a/test/test_eval.py +++ b/test/test_eval.py @@ -62,10 +62,10 @@ def test_eval_reader(reader, document_store: BaseDocumentStore): doc_index="haystack_test_eval_document", device="cpu", ) - assert reader_eval_results["f1"] > 0.65 - assert reader_eval_results["f1"] < 0.67 - assert reader_eval_results["EM"] == 0.5 - assert reader_eval_results["top_n_accuracy"] == 1.0 + assert reader_eval_results["f1"] > 66.65 + assert reader_eval_results["f1"] < 66.67 + assert reader_eval_results["EM"] == 50 + assert reader_eval_results["top_n_accuracy"] == 100.0 @pytest.mark.elasticsearch