diff --git a/docs/_src/benchmarks/reader_performance.json b/docs/_src/benchmarks/reader_performance.json
index 24209b35b..be935fe27 100644
--- a/docs/_src/benchmarks/reader_performance.json
+++ b/docs/_src/benchmarks/reader_performance.json
@@ -11,33 +11,33 @@
],
"data": [
{
- "F1": 82.62983412843887,
- "Speed": 98.86638639776464,
+ "F1": 82.58860575299658,
+ "Speed": 125.81040525892848,
"Model": "RoBERTa"
},
{
- "F1": 78.90026641413856,
- "Speed": 181.96379531485616,
+ "F1": 78.87858491007042,
+ "Speed": 260.6443097981493,
"Model": "MiniLM"
},
{
- "F1": 74.32668866064459,
- "Speed": 106.04748306200683,
+ "F1": 74.31182400443286,
+ "Speed": 121.08066567525722,
"Model": "BERT base"
},
{
- "F1": 83.29492827667042,
- "Speed": 40.408497243719076,
+ "F1": 83.26306774734308,
+ "Speed": 42.21949937744112,
"Model": "BERT large"
},
{
- "F1": 84.62174414643722,
- "Speed": 40.483264542292716,
+ "F1": 84.50422699207468,
+ "Speed": 42.07400844838985,
"Model": "XLM-RoBERTa"
},
{
- "F1": 42.342513261953935,
- "Speed": 160.41712955027901,
+ "F1": 42.31925844723574,
+ "Speed": 222.91207128366702,
"Model": "DistilBERT"
}
]
diff --git a/docs/_src/benchmarks/retriever_map.json b/docs/_src/benchmarks/retriever_map.json
index 8584e7065..65683484c 100644
--- a/docs/_src/benchmarks/retriever_map.json
+++ b/docs/_src/benchmarks/retriever_map.json
@@ -20,82 +20,82 @@
{
"model": "DPR / ElasticSearch",
"n_docs": 1000,
- "map": 0.929
+ "map": 92.95105322830888
},
{
"model": "DPR / ElasticSearch",
"n_docs": 10000,
- "map": 0.881
+ "map": 89.87097014904354
},
{
"model": "DPR / ElasticSearch",
"n_docs": 100000,
- "map": 0.821
+ "map": 86.54564090434241
},
{
"model": "DPR / ElasticSearch",
"n_docs": 500000,
- "map": 0.730
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 1000,
- "map": 0.929
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 10000,
- "map": 0.898
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 100000,
- "map": 0.863
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 500000,
- "map": 0.805
+ "map": 80.86137228234089
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 1000,
- "map": 0.748
+ "map": 74.20444712972909
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 10000,
- "map": 0.6609999999999999
+ "map": 66.20627317806674
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 100000,
- "map": 0.56
+ "map": 56.25959153101251
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 500000,
- "map": 0.452
+ "map": 45.59452709000341
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 1000,
+ "map": 92.95105322830888
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 10000,
+ "map": 89.87097014904354
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "map": 86.54606328368972
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 500000,
+ "map": 80.8613722823409
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 1000,
- "map": 0.929
+ "map": 92.95105322830888
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 10000,
- "map": 0.896
+ "map": 89.69941373746582
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 100000,
- "map": 0.849
+ "map": 85.07984377595874
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 500000,
- "map": 0.766
+ "map": 76.91475821598232
}
]
}
\ No newline at end of file
diff --git a/docs/_src/benchmarks/retriever_performance.json b/docs/_src/benchmarks/retriever_performance.json
index f4985dade..d5075ca3c 100644
--- a/docs/_src/benchmarks/retriever_performance.json
+++ b/docs/_src/benchmarks/retriever_performance.json
@@ -2,7 +2,7 @@
"chart_type": "BarChart",
"title": "Retriever Performance",
"subtitle": "Time and Accuracy Benchmarks",
- "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.",
+ "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here).\n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embeddings are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.",
"bars": "horizontal",
"columns": [
"Model",
@@ -24,30 +24,30 @@
{
"model": "DPR / ElasticSearch",
"n_docs": 100000,
- "index_speed": 73.93635160290218,
- "query_speed": 6.23,
- "map": 82
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 100000,
- "index_speed": 104.77116699738369,
- "query_speed": 4.89,
- "map": 86.3
+ "index_speed": 69.75508852811794,
+ "query_speed": 4.5992769354707805,
+ "map": 86.54564090434241
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 100000,
- "index_speed": 484.32931514144724,
- "query_speed": 162.59,
- "map": 56
+ "index_speed": 482.9993330442806,
+ "query_speed": 162.42378943468643,
+ "map": 56.25959153101251
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "index_speed": 95.52108545730724,
+ "query_speed": 6.511162294559942,
+ "map": 86.54606328368972
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 100000,
- "index_speed": 91.41086878008392,
- "query_speed": 12.85,
- "map": 84.9
+ "index_speed": 84.11829911061136,
+ "query_speed": 33.65729082116796,
+ "map": 85.07984377595874
}
]
}
\ No newline at end of file
diff --git a/docs/_src/benchmarks/retriever_speed.json b/docs/_src/benchmarks/retriever_speed.json
index 8c68cea84..e099421c7 100644
--- a/docs/_src/benchmarks/retriever_speed.json
+++ b/docs/_src/benchmarks/retriever_speed.json
@@ -20,82 +20,82 @@
{
"model": "DPR / ElasticSearch",
"n_docs": 1000,
- "query_speed": 40.802
+ "query_speed": 30.68451185154913
},
{
"model": "DPR / ElasticSearch",
"n_docs": 10000,
- "query_speed": 24.8
+ "query_speed": 19.568754413737462
},
{
"model": "DPR / ElasticSearch",
"n_docs": 100000,
- "query_speed": 6.23
+ "query_speed": 4.5992769354707805
},
{
"model": "DPR / ElasticSearch",
"n_docs": 500000,
- "query_speed": 1.45
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 1000,
- "query_speed": 40.048
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 10000,
- "query_speed": 22.47
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 100000,
- "query_speed": 4.90
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 500000,
- "query_speed": 1.08
+ "query_speed": 1.0558140319761546
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 1000,
- "query_speed": 232.97799999999998
+ "query_speed": 262.9405144288997
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 10000,
- "query_speed": 248.97
+ "query_speed": 183.6070813438718
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 100000,
- "query_speed": 162.59
+ "query_speed": 162.42378943468643
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 500000,
- "query_speed": 91.39
+ "query_speed": 82.43179203331141
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 1000,
+ "query_speed": 35.40380445859966
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 10000,
+ "query_speed": 25.78749025294445
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "query_speed": 6.511162294559942
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 500000,
+ "query_speed": 1.5161593755666505
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 1000,
- "query_speed": 37.884
+ "query_speed": 39.16414272911727
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 10000,
- "query_speed": 31.34
+ "query_speed": 33.6432023480111
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 100000,
- "query_speed": 12.85
+ "query_speed": 33.65729082116796
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 500000,
- "query_speed": 3.32
+ "query_speed": 34.27671486454735
}
]
}
\ No newline at end of file
diff --git a/docs/v0.7.0/_src/benchmarks/reader_performance.json b/docs/v0.7.0/_src/benchmarks/reader_performance.json
index 24209b35b..a6264ed64 100644
--- a/docs/v0.7.0/_src/benchmarks/reader_performance.json
+++ b/docs/v0.7.0/_src/benchmarks/reader_performance.json
@@ -11,33 +11,33 @@
],
"data": [
{
- "F1": 82.62983412843887,
- "Speed": 98.86638639776464,
+ "F1": 82.58860575299658,
+ "Speed": 128.25544874114064,
"Model": "RoBERTa"
},
{
- "F1": 78.90026641413856,
- "Speed": 181.96379531485616,
+ "F1": 78.87858491007042,
+ "Speed": 269.33155450679567,
"Model": "MiniLM"
},
{
- "F1": 74.32668866064459,
- "Speed": 106.04748306200683,
+ "F1": 74.31182400443286,
+ "Speed": 123.82266420208393,
"Model": "BERT base"
},
{
- "F1": 83.29492827667042,
- "Speed": 40.408497243719076,
+ "F1": 83.26306774734308,
+ "Speed": 43.188105620245494,
"Model": "BERT large"
},
{
- "F1": 84.62174414643722,
- "Speed": 40.483264542292716,
+ "F1": 84.50422699207468,
+ "Speed": 42.956527893643,
"Model": "XLM-RoBERTa"
},
{
- "F1": 42.342513261953935,
- "Speed": 160.41712955027901,
+ "F1": 42.31925844723574,
+ "Speed": 226.281948654048,
"Model": "DistilBERT"
}
]
diff --git a/docs/v0.7.0/_src/benchmarks/retriever_map.json b/docs/v0.7.0/_src/benchmarks/retriever_map.json
index 8584e7065..8379f5fc1 100644
--- a/docs/v0.7.0/_src/benchmarks/retriever_map.json
+++ b/docs/v0.7.0/_src/benchmarks/retriever_map.json
@@ -17,85 +17,65 @@
}
],
"data": [
- {
- "model": "DPR / ElasticSearch",
- "n_docs": 1000,
- "map": 0.929
- },
{
"model": "DPR / ElasticSearch",
"n_docs": 10000,
- "map": 0.881
+ "map": 88.26183154948457
},
{
"model": "DPR / ElasticSearch",
"n_docs": 100000,
- "map": 0.821
+ "map": 82.47044752499787
},
{
"model": "DPR / ElasticSearch",
"n_docs": 500000,
- "map": 0.730
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 1000,
- "map": 0.929
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 10000,
- "map": 0.898
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 100000,
- "map": 0.863
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 500000,
- "map": 0.805
- },
- {
- "model": "BM25 / ElasticSearch",
- "n_docs": 1000,
- "map": 0.748
+ "map": 73.4952735751035
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 10000,
- "map": 0.6609999999999999
+ "map": 66.33019927857616
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 100000,
- "map": 0.56
+ "map": 56.25959153101251
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 500000,
- "map": 0.452
+ "map": 45.60339705629754
},
{
- "model": "DPR / FAISS (HSNW)",
- "n_docs": 1000,
- "map": 0.929
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 10000,
+ "map": 89.87097014904354
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "map": 86.54606328368972
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 500000,
+ "map": 80.8613722823409
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 10000,
- "map": 0.896
+ "map": 89.69941373746582
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 100000,
- "map": 0.849
+ "map": 85.07984377595874
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 500000,
- "map": 0.766
+ "map": 76.91475821598232
}
]
}
\ No newline at end of file
diff --git a/docs/v0.7.0/_src/benchmarks/retriever_performance.json b/docs/v0.7.0/_src/benchmarks/retriever_performance.json
index f4985dade..08b6dcadf 100644
--- a/docs/v0.7.0/_src/benchmarks/retriever_performance.json
+++ b/docs/v0.7.0/_src/benchmarks/retriever_performance.json
@@ -24,30 +24,30 @@
{
"model": "DPR / ElasticSearch",
"n_docs": 100000,
- "index_speed": 73.93635160290218,
- "query_speed": 6.23,
- "map": 82
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 100000,
- "index_speed": 104.77116699738369,
- "query_speed": 4.89,
- "map": 86.3
+ "index_speed": 70.7842323095542,
+ "query_speed": 6.108417678791537,
+ "map": 82.47044752499787
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 100000,
- "index_speed": 484.32931514144724,
- "query_speed": 162.59,
- "map": 56
+ "index_speed": 486.8274411916137,
+ "query_speed": 162.40717155994315,
+ "map": 56.25959153101251
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "index_speed": 98.76191471019415,
+ "query_speed": 6.614453113633132,
+ "map": 86.54606328368972
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 100000,
- "index_speed": 91.41086878008392,
- "query_speed": 12.85,
- "map": 84.9
+ "index_speed": 86.8695584169603,
+ "query_speed": 38.24323466239034,
+ "map": 85.07984377595874
}
]
}
\ No newline at end of file
diff --git a/docs/v0.7.0/_src/benchmarks/retriever_speed.json b/docs/v0.7.0/_src/benchmarks/retriever_speed.json
index 8c68cea84..0c599ceb8 100644
--- a/docs/v0.7.0/_src/benchmarks/retriever_speed.json
+++ b/docs/v0.7.0/_src/benchmarks/retriever_speed.json
@@ -17,85 +17,65 @@
}
],
"data": [
- {
- "model": "DPR / ElasticSearch",
- "n_docs": 1000,
- "query_speed": 40.802
- },
{
"model": "DPR / ElasticSearch",
"n_docs": 10000,
- "query_speed": 24.8
+ "query_speed": 23.3662850838307
},
{
"model": "DPR / ElasticSearch",
"n_docs": 100000,
- "query_speed": 6.23
+ "query_speed": 6.108417678791537
},
{
"model": "DPR / ElasticSearch",
"n_docs": 500000,
- "query_speed": 1.45
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 1000,
- "query_speed": 40.048
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 10000,
- "query_speed": 22.47
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 100000,
- "query_speed": 4.90
- },
- {
- "model": "DPR / FAISS (flat)",
- "n_docs": 500000,
- "query_speed": 1.08
- },
- {
- "model": "BM25 / ElasticSearch",
- "n_docs": 1000,
- "query_speed": 232.97799999999998
+ "query_speed": 1.4393100251286972
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 10000,
- "query_speed": 248.97
+ "query_speed": 244.5160214986071
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 100000,
- "query_speed": 162.59
+ "query_speed": 162.40717155994315
},
{
"model": "BM25 / ElasticSearch",
"n_docs": 500000,
- "query_speed": 91.39
+ "query_speed": 88.52692529827672
},
{
- "model": "DPR / FAISS (HSNW)",
- "n_docs": 1000,
- "query_speed": 37.884
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 10000,
+ "query_speed": 27.22085301792775
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 100000,
+ "query_speed": 6.614453113633132
+ },
+ {
+ "model": "DPR / FAISS (flat)",
+ "n_docs": 500000,
+ "query_speed": 1.5222363376940002
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 10000,
- "query_speed": 31.34
+ "query_speed": 39.903073511580295
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 100000,
- "query_speed": 12.85
+ "query_speed": 38.24323466239034
},
{
"model": "DPR / FAISS (HSNW)",
"n_docs": 500000,
- "query_speed": 3.32
+ "query_speed": 37.13917579922844
}
]
}
\ No newline at end of file
diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py
index 2fc0f6847..c8a6e665b 100644
--- a/haystack/reader/farm.py
+++ b/haystack/reader/farm.py
@@ -524,9 +524,9 @@ class FARMReader(BaseReader):
toc = perf_counter()
reader_time = toc - tic
results = {
- "EM": eval_results[0]["EM"],
- "f1": eval_results[0]["f1"],
- "top_n_accuracy": eval_results[0]["top_n_accuracy"],
+ "EM": eval_results[0]["EM"] * 100,
+ "f1": eval_results[0]["f1"] * 100,
+ "top_n_accuracy": eval_results[0]["top_n_accuracy"] * 100,
"top_n": self.inferencer.model.prediction_heads[0].n_best,
"reader_time": reader_time,
"seconds_per_query": reader_time / n_queries
diff --git a/haystack/retriever/dense.py b/haystack/retriever/dense.py
index e357859b8..fe5d207cc 100644
--- a/haystack/retriever/dense.py
+++ b/haystack/retriever/dense.py
@@ -187,8 +187,7 @@ class DensePassageRetriever(BaseRetriever):
:return: dictionary of embeddings for "passages" and "query"
"""
-
- dataset, tensor_names, problematic_ids, baskets = self.processor.dataset_from_dicts(
+ dataset, tensor_names, _, baskets = self.processor.dataset_from_dicts(
dicts, indices=[i for i in range(len(dicts))], return_baskets=True
)
diff --git a/test/benchmarks/config.json b/test/benchmarks/config.json
index c352fbcad..77d03eb4d 100644
--- a/test/benchmarks/config.json
+++ b/test/benchmarks/config.json
@@ -20,6 +20,7 @@
]
],
"n_docs_options": [
+ 1000,
10000,
100000,
500000
diff --git a/test/benchmarks/reader.py b/test/benchmarks/reader.py
index 41308a106..9ada5a748 100644
--- a/test/benchmarks/reader.py
+++ b/test/benchmarks/reader.py
@@ -1,5 +1,5 @@
from utils import get_document_store, index_to_doc_store, get_reader
-from haystack.preprocessor.utils import eval_data_from_file
+from haystack.preprocessor.utils import eval_data_from_json
from farm.data_handler.utils import _download_extract_downstream_data
from pathlib import Path
@@ -40,7 +40,7 @@ def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs)
doc_store = get_document_store("elasticsearch")
# download squad data
_download_extract_downstream_data(input_file=data_dir/filename)
- docs, labels = eval_data_from_file(data_dir/filename, max_docs=None)
+ docs, labels = eval_data_from_json(data_dir/filename, max_docs=None)
index_to_doc_store(doc_store, docs, None, labels)
for reader_name in reader_models:
diff --git a/test/benchmarks/reader_results.csv b/test/benchmarks/reader_results.csv
index 9b9411a83..9ef56e9bb 100644
--- a/test/benchmarks/reader_results.csv
+++ b/test/benchmarks/reader_results.csv
@@ -1,7 +1,7 @@
,EM,f1,top_n_accuracy,top_n,reader_time,seconds_per_query,passages_per_second,reader,error
-0,0.7836676217765043,0.8262983412843887,0.9742963087813922,5,124.91606550999859,0.01052722615118815,98.86638639776463,deepset/roberta-base-squad2,
-1,0.7439743805831789,0.7890026641413856,0.9720209000505646,5,67.87064415001078,0.005719757639475036,181.96379531485616,deepset/minilm-uncased-squad2,
-2,0.6947581324793528,0.7432668866064459,0.9557559413450194,5,116.45726653200109,0.009814365964267747,106.04748306200683,deepset/bert-base-cased-squad2,
-3,0.7900724759817968,0.8329492827667042,0.976908815101972,5,305.62878707199707,0.02575668187021718,40.40849724371908,deepset/bert-large-uncased-whole-word-masking-squad2,
-4,0.803472105174448,0.846217441464372,0.9742120343839542,5,305.06433064700104,0.025709112645120602,40.48326454229272,deepset/xlm-roberta-large-squad2,
-5,0.3730827574582842,0.42342513261953935,0.9539019046013821,5,76.98679084099422,0.006488015408814615,160.417129550279,distilbert-base-uncased-distilled-squad,
+0,0.7839204449688185,0.8258860575299658,0.9742120343839542,5,98.16358173700064,0.008272676701247315,125.81040525892847,deepset/roberta-base-squad2,
+1,0.7438058317883027,0.7887858491007042,0.9719366256531266,5,47.38258053499885,0.003993138423647299,260.6443097981493,deepset/minilm-uncased-squad2,
+2,0.6947581324793528,0.7431182400443286,0.9557559413450194,5,101.99811779300217,0.008595829916821352,121.08066567525722,deepset/bert-base-cased-squad2,
+3,0.7897353783920446,0.8326306774734308,0.976908815101972,5,292.51886408200517,0.024651851009776266,42.21949937744112,deepset/bert-large-uncased-whole-word-masking-squad2,
+4,0.8021237148154391,0.8450422699207468,0.974043485589078,5,293.53038741600176,0.024737096529243364,42.07400844838984,deepset/xlm-roberta-large-squad2,
+5,0.3729984830608461,0.4231925844723574,0.9539019046013821,5,55.403011280999635,0.004669055391960192,222.91207128366705,distilbert-base-uncased-distilled-squad,
diff --git a/test/benchmarks/retriever.py b/test/benchmarks/retriever.py
index f871dfaca..f71297b69 100644
--- a/test/benchmarks/retriever.py
+++ b/test/benchmarks/retriever.py
@@ -2,7 +2,7 @@ import pandas as pd
from pathlib import Path
from time import perf_counter
from utils import get_document_store, get_retriever, index_to_doc_store, load_config
-from haystack.preprocessor.utils import eval_data_from_file
+from haystack.preprocessor.utils import eval_data_from_json
from haystack.document_store.faiss import FAISSDocumentStore
from haystack import Document
@@ -134,7 +134,11 @@ def benchmark_querying(n_docs_options,
for retriever_name, doc_store_name in retriever_doc_stores:
try:
logger.info(f"##### Start querying run: {retriever_name}, {doc_store_name}, {n_docs} docs ##### ")
- doc_store = get_document_store(doc_store_name)
+ if retriever_name == "elastic":
+ similarity = "cosine"
+ else:
+ similarity = "dot_product"
+ doc_store = get_document_store(doc_store_name, similarity=similarity)
retriever = get_retriever(retriever_name, doc_store)
add_precomputed = retriever_name in ["dpr"]
# For DPR, precomputed embeddings are loaded from file
@@ -160,8 +164,8 @@ def benchmark_querying(n_docs_options,
"retrieve_time": raw_results["retrieve_time"],
"queries_per_second": raw_results["n_questions"] / raw_results["retrieve_time"],
"seconds_per_query": raw_results["retrieve_time"]/ raw_results["n_questions"],
- "recall": raw_results["recall"],
- "map": raw_results["map"],
+ "recall": raw_results["recall"] * 100,
+ "map": raw_results["map"] * 100,
"top_k": raw_results["top_k"],
"date_time": datetime.datetime.now(),
"error": None
@@ -265,7 +269,7 @@ def prepare_data(data_dir, filename_gold, filename_negative, data_s3_url, embed
download_from_s3(data_s3_url + str(embeddings_dir) + embedding_filename, cache_dir=data_dir)
logging.getLogger("farm").setLevel(logging.WARN)
- gold_docs, labels = eval_data_from_file(data_dir + filename_gold)
+ gold_docs, labels = eval_data_from_json(data_dir + filename_gold)
# Reduce number of docs
gold_docs = gold_docs[:n_docs]
diff --git a/test/benchmarks/retriever_index_results.csv b/test/benchmarks/retriever_index_results.csv
index 1a9253e30..b587e10b4 100644
--- a/test/benchmarks/retriever_index_results.csv
+++ b/test/benchmarks/retriever_index_results.csv
@@ -1,13 +1,17 @@
,retriever,doc_store,n_docs,indexing_time,docs_per_second,date_time,error
-1,dpr,elasticsearch,10000,135.8048727600035,73.63506033890373,2020-12-02 06:51:48.587178,
-5,dpr,elasticsearch,100000,1352.514667440999,73.93635160290218,2020-12-02 07:23:04.264694,
-9,dpr,elasticsearch,500000,6781.024389943996,73.7351720400064,2020-12-02 10:10:42.147031,
-0,elastic,elasticsearch,10000,20.694342684997537,483.223852635317,2020-12-02 06:49:00.317977,
-4,elastic,elasticsearch,100000,206.47108666299755,484.32931514144724,2020-12-02 06:59:54.055199,
-8,elastic,elasticsearch,500000,1032.1480222880054,484.4266415311529,2020-12-02 08:16:15.828533,
-2,dpr,faiss_flat,10000,95.10171413100034,105.15057579535569,2020-12-02 06:53:59.472952,
-6,dpr,faiss_flat,100000,954.4610684969957,104.77116699738367,2020-12-02 07:39:56.194345,
-10,dpr,faiss_flat,500000,4865.149988802004,102.77175444761984,2020-12-02 11:34:34.726687,
-3,dpr,faiss_hnsw,10000,103.25490099400486,96.84770314757859,2020-12-02 06:56:14.230579,
-7,dpr,faiss_hnsw,100000,1093.9618158599915,91.41086878008392,2020-12-02 07:58:43.508489,
-11,dpr,faiss_hnsw,500000,5784.850161597002,86.43266221816312,2020-12-02 13:11:43.328380,
+1,dpr,elasticsearch,1000,15.336494209999728,65.20394989279743,2021-01-29 11:18:25.436371,
+5,dpr,elasticsearch,10000,144.0823780490009,69.40474008972218,2021-01-29 11:23:19.896920,
+9,dpr,elasticsearch,100000,1433.587170629,69.75508852811794,2021-01-29 11:56:22.502185,
+13,dpr,elasticsearch,500000,7196.396471723998,69.47921810097519,2021-01-29 14:54:08.769187,
+0,elastic,elasticsearch,1000,2.1182381880007597,472.0904408506686,2021-01-29 11:17:14.160560,
+4,elastic,elasticsearch,10000,20.23965223199957,494.0796356268248,2021-01-29 11:20:27.378846,
+8,elastic,elasticsearch,100000,207.03962336699988,482.9993330442806,2021-01-29 11:31:50.829072,
+12,elastic,elasticsearch,500000,1029.1638562459993,485.8312862091863,2021-01-29 12:52:45.994426,
+2,dpr,faiss_flat,1000,9.899907313998483,101.01104669798258,2021-01-29 11:19:11.304749,
+6,dpr,faiss_flat,10000,104.1660261320012,96.00059032037764,2021-01-29 11:25:43.069491,
+10,dpr,faiss_flat,100000,1046.8892760299987,95.52108545730724,2021-01-29 12:14:51.105055,
+14,dpr,faiss_flat,500000,5243.775349973999,95.35114810028603,2021-01-29 16:24:19.855339,
+3,dpr,faiss_hnsw,1000,10.329135104999295,96.81352696374361,2021-01-29 11:19:55.337391,
+7,dpr,faiss_hnsw,10000,112.53792207699917,88.85893586304122,2021-01-29 11:28:10.284866,
+11,dpr,faiss_hnsw,100000,1188.8019736170008,84.11829911061136,2021-01-29 12:35:16.166263,
+15,dpr,faiss_hnsw,500000,6125.295488232001,81.62871504903015,2021-01-29 18:07:08.100722,
diff --git a/test/benchmarks/retriever_query_results.csv b/test/benchmarks/retriever_query_results.csv
index c5759ff1a..106ab88ee 100644
--- a/test/benchmarks/retriever_query_results.csv
+++ b/test/benchmarks/retriever_query_results.csv
@@ -1,13 +1,17 @@
,retriever,doc_store,n_docs,n_queries,retrieve_time,queries_per_second,seconds_per_query,recall,map,top_k,date_time,error
-1,dpr,elasticsearch,10000,5791,233.54168710688828,24.796429587106445,0.040328386652890395,0.9690899671904679,0.8808447974826822,10,2020-12-02 13:18:27.808539,
-5,dpr,elasticsearch,100000,5791,928.9148432369257,6.234155953220104,0.1604066384453334,0.9397340701087895,0.8212235461156204,10,2020-12-02 13:53:44.689757,
-9,dpr,elasticsearch,500000,5791,3992.798643678747,1.45036114184423,0.6894834473629333,0.8919012260404076,0.7302081363253893,10,2020-12-02 17:35:25.795083,
-0,elastic,elasticsearch,10000,5791,23.260322959773475,248.9647289083211,0.00401663321702184,0.8103954412018649,0.6609973604361457,10,2020-12-02 13:13:03.957613,
-4,elastic,elasticsearch,100000,5791,35.61682877641579,162.59167924109505,0.006150376234918976,0.7168019340355725,0.559593430418849,10,2020-12-02 13:33:30.417021,
-8,elastic,elasticsearch,500000,5791,63.36918604133825,91.38510941614904,0.010942701785760362,0.6238991538594371,0.45245893326535686,10,2020-12-02 16:08:13.070376,
-2,dpr,faiss_flat,10000,5791,257.67369354520633,22.474160712040344,0.044495543696288435,0.9746157831117251,0.8978985590667505,10,2020-12-02 13:23:51.002905,
-6,dpr,faiss_flat,100000,5791,1182.7107160334417,4.896379073508164,0.2042325532780939,0.9575202901053359,0.8630120493486063,10,2020-12-02 14:18:14.837806,
-3,dpr,faiss_hnsw,10000,5791,184.7552210999711,31.34417509568776,0.03190385444655001,0.972198238646175,0.8961883245210815,10,2020-12-02 13:28:33.415220,
-7,dpr,faiss_hnsw,100000,5791,450.7693457186833,12.84692505158515,0.0778396383558424,0.9399067518563288,0.8486882354392283,10,2020-12-02 15:10:44.114148,
-8,dpr,faiss_flat,500000,5791,5365.806154628852,1.0792413727067556,0.9265767837383616,0.9295458470039717,0.8045832613826054,10,2020-12-02 23:14:44.503864,
-9,dpr,faiss_hnsw,500000,5791,1745.922715222303,3.3168707580865915,0.30148898553312087,0.8820583664306683,0.765677378416975,10,2020-12-03 00:18:53.376265,
+1,dpr,elasticsearch,1000,1064,34.67547423102587,30.684511851549132,0.0325897314201371,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:27:43.048502,
+5,dpr,elasticsearch,10000,5637,288.06125728895495,19.568754413737462,0.05110187285594375,0.9749866950505588,0.8987097014904354,10,2021-02-01 11:37:21.149887,
+9,dpr,elasticsearch,100000,5637,1225.6274364620313,4.5992769354707805,0.2174254810115365,0.9579563597658329,0.8654564090434241,10,2021-02-01 12:15:52.757320,
+13,dpr,elasticsearch,500000,5637,5339.008413678017,1.0558140319761546,0.947136493467805,0.9308142629058009,0.8086137228234089,10,2021-02-01 14:52:23.056230,
+0,elastic,elasticsearch,1000,1064,4.046542626992959,262.9405144288997,0.0038031415667227064,0.8909774436090225,0.742044471297291,10,2021-02-01 11:26:04.346134,
+4,elastic,elasticsearch,10000,5637,30.701430243001596,183.6070813438718,0.005446413028738974,0.8110697179350719,0.6620627317806674,10,2021-02-01 11:31:20.470092,
+8,elastic,elasticsearch,100000,5637,34.705507238930295,162.42378943468643,0.006156733588598598,0.7193542664537875,0.5625959153101251,10,2021-02-01 11:50:36.048887,
+12,elastic,elasticsearch,500000,5637,68.3838099470413,82.4317920333114,0.01213124178588634,0.6274614156466205,0.45594527090003406,10,2021-02-01 13:02:16.905187,
+2,dpr,faiss_flat,1000,1064,30.053267333012627,35.40380445859966,0.028245552004711117,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:28:55.544474,
+6,dpr,faiss_flat,10000,5637,218.59436279792135,25.78749025294445,0.038778492602079356,0.9749866950505588,0.8987097014904354,10,2021-02-01 11:42:07.545869,
+10,dpr,faiss_flat,100000,5637,865.7440476809643,6.511162294559943,0.15358241044544338,0.9579563597658329,0.8654606328368972,10,2021-02-01 12:34:29.493598,
+14,dpr,faiss_flat,500000,5637,3717.9468668280497,1.5161593755666505,0.6595612678424783,0.9308142629058009,0.808613722823409,10,2021-02-01 16:12:52.804436,
+3,dpr,faiss_hnsw,1000,1064,27.167708159968242,39.164142729117266,0.02553356030072203,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:30:02.684535,
+7,dpr,faiss_hnsw,10000,5637,167.55242089293097,33.6432023480111,0.02972368651639719,0.972503104488203,0.8969941373746582,10,2021-02-01 11:46:07.130588,
+11,dpr,faiss_hnsw,100000,5637,167.48228578322596,33.65729082116796,0.029711244595214823,0.9402164271775767,0.8507984377595874,10,2021-02-01 12:43:21.697968,
+15,dpr,faiss_hnsw,500000,5637,164.45566683610014,34.27671486454735,0.029174324434291313,0.8825616462657442,0.7691475821598232,10,2021-02-01 16:47:01.710072,
diff --git a/test/benchmarks/retriever_query_results.md b/test/benchmarks/retriever_query_results.md
new file mode 100644
index 000000000..42db185ec
--- /dev/null
+++ b/test/benchmarks/retriever_query_results.md
@@ -0,0 +1,18 @@
+| | retriever | doc_store | n_docs | n_queries | retrieve_time | queries_per_second | seconds_per_query | recall | map | top_k | date_time | error |
+|---:|:------------|:--------------|---------:|------------:|----------------:|---------------------:|--------------------:|---------:|---------:|--------:|:---------------------------|:--------|
+| 1 | dpr | elasticsearch | 1000 | 1064 | 34.6755 | 30.6845 | 0.0325897 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:27:43.048502 | |
+| 5 | dpr | elasticsearch | 10000 | 5637 | 288.061 | 19.5688 | 0.0511019 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:37:21.149887 | |
+| 9 | dpr | elasticsearch | 100000 | 5637 | 1225.63 | 4.59928 | 0.217425 | 0.957956 | 0.865456 | 10 | 2021-02-01 12:15:52.757320 | |
+| 13 | dpr | elasticsearch | 500000 | 5637 | 5339.01 | 1.05581 | 0.947136 | 0.930814 | 0.808614 | 10 | 2021-02-01 14:52:23.056230 | |
+| 0 | elastic | elasticsearch | 1000 | 1064 | 4.04654 | 262.941 | 0.00380314 | 0.890977 | 0.742044 | 10 | 2021-02-01 11:26:04.346134 | |
+| 4 | elastic | elasticsearch | 10000 | 5637 | 30.7014 | 183.607 | 0.00544641 | 0.81107 | 0.662063 | 10 | 2021-02-01 11:31:20.470092 | |
+| 8 | elastic | elasticsearch | 100000 | 5637 | 34.7055 | 162.424 | 0.00615673 | 0.719354 | 0.562596 | 10 | 2021-02-01 11:50:36.048887 | |
+| 12 | elastic | elasticsearch | 500000 | 5637 | 68.3838 | 82.4318 | 0.0121312 | 0.627461 | 0.455945 | 10 | 2021-02-01 13:02:16.905187 | |
+| 2 | dpr | faiss_flat | 1000 | 1064 | 30.0533 | 35.4038 | 0.0282456 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:28:55.544474 | |
+| 6 | dpr | faiss_flat | 10000 | 5637 | 218.594 | 25.7875 | 0.0387785 | 0.974987 | 0.89871 | 10 | 2021-02-01 11:42:07.545869 | |
+| 10 | dpr | faiss_flat | 100000 | 5637 | 865.744 | 6.51116 | 0.153582 | 0.957956 | 0.865461 | 10 | 2021-02-01 12:34:29.493598 | |
+| 14 | dpr | faiss_flat | 500000 | 5637 | 3717.95 | 1.51616 | 0.659561 | 0.930814 | 0.808614 | 10 | 2021-02-01 16:12:52.804436 | |
+| 3 | dpr | faiss_hnsw | 1000 | 1064 | 27.1677 | 39.1641 | 0.0255336 | 0.991541 | 0.929511 | 10 | 2021-02-01 11:30:02.684535 | |
+| 7 | dpr | faiss_hnsw | 10000 | 5637 | 167.552 | 33.6432 | 0.0297237 | 0.972503 | 0.896994 | 10 | 2021-02-01 11:46:07.130588 | |
+| 11 | dpr | faiss_hnsw | 100000 | 5637 | 167.482 | 33.6573 | 0.0297112 | 0.940216 | 0.850798 | 10 | 2021-02-01 12:43:21.697968 | |
+| 15 | dpr | faiss_hnsw | 500000 | 5637 | 164.456 | 34.2767 | 0.0291743 | 0.882562 | 0.769148 | 10 | 2021-02-01 16:47:01.710072 | |
\ No newline at end of file
diff --git a/test/benchmarks/templates.py b/test/benchmarks/templates.py
index c33e858d7..43eefa196 100644
--- a/test/benchmarks/templates.py
+++ b/test/benchmarks/templates.py
@@ -16,7 +16,7 @@ RETRIEVER_TEMPLATE = {
"chart_type": "BarChart",
"title": "Retriever Performance",
"subtitle": "Time and Accuracy Benchmarks",
- "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.",
+ "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embeddings are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.",
"bars": "horizontal",
"columns": [
"Model",
diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py
index 8baa2dbd1..2d9e40c91 100644
--- a/test/benchmarks/utils.py
+++ b/test/benchmarks/utils.py
@@ -22,7 +22,7 @@ reader_types = ["farm"]
doc_index = "eval_document"
label_index = "label"
-def get_document_store(document_store_type, es_similarity='cosine'):
+def get_document_store(document_store_type, similarity='dot_product'):
""" TODO This method is taken from test/conftest.py but maybe should be within Haystack.
Perhaps a class method of DocStore that just takes string for type of DocStore"""
if document_store_type == "sql":
@@ -35,7 +35,7 @@ def get_document_store(document_store_type, es_similarity='cosine'):
# make sure we start from a fresh index
client = Elasticsearch()
client.indices.delete(index='haystack_test*', ignore=[404])
- document_store = ElasticsearchDocumentStore(index="eval_document", similarity=es_similarity, timeout=3000)
+ document_store = ElasticsearchDocumentStore(index="eval_document", similarity=similarity, timeout=3000)
elif document_store_type in("faiss_flat", "faiss_hnsw"):
if document_store_type == "faiss_flat":
index_type = "Flat"
@@ -48,12 +48,13 @@ def get_document_store(document_store_type, es_similarity='cosine'):
status = subprocess.run(
['docker run --name haystack-postgres -p 5432:5432 -e POSTGRES_PASSWORD=password -d postgres'],
shell=True)
- time.sleep(3)
+ time.sleep(6)
status = subprocess.run(
['docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'], shell=True)
time.sleep(1)
document_store = FAISSDocumentStore(sql_url="postgresql://postgres:password@localhost:5432/haystack",
- faiss_index_factory_str=index_type)
+ faiss_index_factory_str=index_type,
+ similarity=similarity)
else:
raise Exception(f"No document store fixture for '{document_store_type}'")
diff --git a/test/test_eval.py b/test/test_eval.py
index 95d781f21..528002da9 100644
--- a/test/test_eval.py
+++ b/test/test_eval.py
@@ -62,10 +62,10 @@ def test_eval_reader(reader, document_store: BaseDocumentStore):
doc_index="haystack_test_eval_document",
device="cpu",
)
- assert reader_eval_results["f1"] > 0.65
- assert reader_eval_results["f1"] < 0.67
- assert reader_eval_results["EM"] == 0.5
- assert reader_eval_results["top_n_accuracy"] == 1.0
+ assert reader_eval_results["f1"] > 66.65
+ assert reader_eval_results["f1"] < 66.67
+ assert reader_eval_results["EM"] == 50
+ assert reader_eval_results["top_n_accuracy"] == 100.0
@pytest.mark.elasticsearch