diff --git a/.gitignore b/.gitignore index cef05a51e..23f8a2693 100644 --- a/.gitignore +++ b/.gitignore @@ -149,6 +149,7 @@ tutorials/cache tutorials/mlruns tutorials/model models +saved_models *_build .DS_Store diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 4d954b073..6c9d1d757 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -827,8 +827,9 @@ the vector embeddings are indexed in a FAISS Index. Recommended options: - "Flat" (default): Best accuracy (= exact). Becomes slow and RAM intense for > 1 Mio docs. - "HNSW": Graph-based heuristic. If not further specified, - we use a RAM intense, but more accurate config: - HNSW256, efConstruction=256 and efSearch=256 + we use the following config: + HNSW64, efConstruction=80 and efSearch=20 + - "IVFx,Flat": Inverted Index. Replace x with the number of centroids aka nlist. Rule of thumb: nlist = 10 * sqrt (num_docs) is a good starting point. For more details see: diff --git a/docs/_src/benchmarks/retriever_map.json b/docs/_src/benchmarks/retriever_map.json index 65683484c..346cccbb5 100644 --- a/docs/_src/benchmarks/retriever_map.json +++ b/docs/_src/benchmarks/retriever_map.json @@ -8,7 +8,10 @@ "BM25 / ElasticSearch", "DPR / ElasticSearch", "DPR / FAISS (flat)", - "DPR / FAISS (HSNW)" + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)" + ], "axis": [ { @@ -17,25 +20,25 @@ } ], "data": [ - { - "model": "DPR / ElasticSearch", - "n_docs": 1000, - "map": 92.95105322830888 - }, { "model": "DPR / ElasticSearch", "n_docs": 10000, - "map": 89.87097014904354 + "map": 89.87097014904356 }, { - "model": "DPR / ElasticSearch", + "model": "BM25 / ElasticSearch", "n_docs": 100000, - "map": 86.54564090434241 + "map": 56.259591531012504 + }, + { + "model": "BM25 / ElasticSearch", + "n_docs": 10000, + "map": 66.33019927857616 }, { "model": "DPR / ElasticSearch", - "n_docs": 500000, - "map": 80.86137228234089 + "n_docs": 1000, + "map": 92.95105322830891 }, { "model": "BM25 / ElasticSearch", @@ -43,24 +46,29 @@ "map": 74.20444712972909 }, { - "model": "BM25 / ElasticSearch", - "n_docs": 10000, - "map": 66.20627317806674 - }, - { - "model": "BM25 / ElasticSearch", + "model": "DPR / ElasticSearch", "n_docs": 100000, - "map": 56.25959153101251 + "map": 86.54606328368973 }, { "model": "BM25 / ElasticSearch", "n_docs": 500000, - "map": 45.59452709000341 + "map": 45.60339705629754 + }, + { + "model": "DPR / ElasticSearch", + "n_docs": 500000, + "map": 80.86137228234091 }, { "model": "DPR / FAISS (flat)", "n_docs": 1000, - "map": 92.95105322830888 + "map": 92.95105322830891 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "map": 80.86137228234091 }, { "model": "DPR / FAISS (flat)", @@ -70,32 +78,67 @@ { "model": "DPR / FAISS (flat)", "n_docs": 100000, - "map": 86.54606328368972 + "map": 86.54606328368973 }, { - "model": "DPR / FAISS (flat)", - "n_docs": 500000, - "map": 80.8613722823409 - }, - { - "model": "DPR / FAISS (HSNW)", - "n_docs": 1000, - "map": 92.95105322830888 - }, - { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / FAISS (HNSW)", "n_docs": 10000, - "map": 89.69941373746582 + "map": 89.49563682134192 }, { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / FAISS (HNSW)", "n_docs": 100000, - "map": 85.07984377595874 + "map": 84.33419639513305 }, { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / FAISS (HNSW)", "n_docs": 500000, - "map": 76.91475821598232 + "map": 75.73315903145605 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "map": 74.85616575291942 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "map": 86.54606328368973 } ] } \ No newline at end of file diff --git a/docs/_src/benchmarks/retriever_performance.json b/docs/_src/benchmarks/retriever_performance.json index 07f8a3b41..58e8352f2 100644 --- a/docs/_src/benchmarks/retriever_performance.json +++ b/docs/_src/benchmarks/retriever_performance.json @@ -21,33 +21,47 @@ "time_label": "seconds" }, "data": [ - { - "model": "DPR / ElasticSearch", - "n_docs": 100000, - "index_speed": 69.75508852811794, - "query_speed": 4.5992769354707805, - "map": 86.54564090434241 - }, { "model": "BM25 / ElasticSearch", "n_docs": 100000, - "index_speed": 482.9993330442806, - "query_speed": 162.42378943468643, - "map": 56.25959153101251 + "index_speed": 485.5602670200369, + "query_speed": 165.51512861040828, + "map": 56.259591531012504 + }, + { + "model": "DPR / ElasticSearch", + "n_docs": 100000, + "index_speed": 71.36964873196698, + "query_speed": 5.355677072083696, + "map": 86.54606328368973 }, { "model": "DPR / FAISS (flat)", "n_docs": 100000, - "index_speed": 95.52108545730724, - "query_speed": 6.511162294559942, - "map": 86.54606328368972 + "index_speed": 100.01184910084558, + "query_speed": 6.624479268751268, + "map": 86.54606328368973 }, { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / FAISS (HNSW)", "n_docs": 100000, - "index_speed": 84.11829911061136, - "query_speed": 33.65729082116796, - "map": 85.07984377595874 + "index_speed": 89.90389306648805, + "query_speed": 40.68196225525062, + "map": 84.33419639513305 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "index_speed": 116.00982709720004, + "query_speed": 28.30393009791128, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "index_speed": 115.61076852516383, + "query_speed": 28.076443272229284, + "map": 86.54606328368973 } ] } \ No newline at end of file diff --git a/docs/_src/benchmarks/retriever_speed.json b/docs/_src/benchmarks/retriever_speed.json index e099421c7..2fa8c987d 100644 --- a/docs/_src/benchmarks/retriever_speed.json +++ b/docs/_src/benchmarks/retriever_speed.json @@ -8,7 +8,9 @@ "BM25 / ElasticSearch", "DPR / ElasticSearch", "DPR / FAISS (flat)", - "DPR / FAISS (HSNW)" + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)" ], "axis": [ { @@ -19,83 +21,123 @@ "data": [ { "model": "DPR / ElasticSearch", - "n_docs": 1000, - "query_speed": 30.68451185154913 + "n_docs": 10000, + "query_speed": 22.92376153263135 + }, + { + "model": "BM25 / ElasticSearch", + "n_docs": 100000, + "query_speed": 165.51512861040828 + }, + { + "model": "BM25 / ElasticSearch", + "n_docs": 10000, + "query_speed": 260.2575025618042 }, { "model": "DPR / ElasticSearch", - "n_docs": 10000, - "query_speed": 19.568754413737462 + "n_docs": 1000, + "query_speed": 34.64504401787953 + }, + { + "model": "BM25 / ElasticSearch", + "n_docs": 1000, + "query_speed": 357.3482189096959 }, { "model": "DPR / ElasticSearch", "n_docs": 100000, - "query_speed": 4.5992769354707805 + "query_speed": 5.355677072083696 + }, + { + "model": "BM25 / ElasticSearch", + "n_docs": 500000, + "query_speed": 90.8126875640674 }, { "model": "DPR / ElasticSearch", "n_docs": 500000, - "query_speed": 1.0558140319761546 - }, - { - "model": "BM25 / ElasticSearch", - "n_docs": 1000, - "query_speed": 262.9405144288997 - }, - { - "model": "BM25 / ElasticSearch", - "n_docs": 10000, - "query_speed": 183.6070813438718 - }, - { - "model": "BM25 / ElasticSearch", - "n_docs": 100000, - "query_speed": 162.42378943468643 - }, - { - "model": "BM25 / ElasticSearch", - "n_docs": 500000, - "query_speed": 82.43179203331141 + "query_speed": 1.2322912620168611 }, { "model": "DPR / FAISS (flat)", "n_docs": 1000, - "query_speed": 35.40380445859966 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 10000, - "query_speed": 25.78749025294445 - }, - { - "model": "DPR / FAISS (flat)", - "n_docs": 100000, - "query_speed": 6.511162294559942 + "query_speed": 40.40867245729847 }, { "model": "DPR / FAISS (flat)", "n_docs": 500000, - "query_speed": 1.5161593755666505 + "query_speed": 1.5414031869280982 }, { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "query_speed": 28.034963597300674 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "query_speed": 6.624479268751268 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 10000, + "query_speed": 42.12400556900196 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "query_speed": 40.68196225525062 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 500000, + "query_speed": 39.42722374998517 + }, + { + "model": "DPR / FAISS (HNSW)", "n_docs": 1000, - "query_speed": 39.16414272911727 + "query_speed": 44.31880791041422 }, { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "query_speed": 40.48084417170779 + }, + { + "model": "DPR / Milvus (flat)", "n_docs": 10000, - "query_speed": 33.6432023480111 + "query_speed": 38.132788329389 }, { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / Milvus (flat)", "n_docs": 100000, - "query_speed": 33.65729082116796 + "query_speed": 28.30393009791128 }, { - "model": "DPR / FAISS (HSNW)", + "model": "DPR / Milvus (flat)", "n_docs": 500000, - "query_speed": 34.27671486454735 + "query_speed": 15.30425741318099 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "query_speed": 40.38894718145225 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "query_speed": 38.15488156137084 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "query_speed": 24.503220592922823 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "query_speed": 28.076443272229284 } ] } \ No newline at end of file diff --git a/haystack/document_store/faiss.py b/haystack/document_store/faiss.py index 271d0bc75..39e070f11 100644 --- a/haystack/document_store/faiss.py +++ b/haystack/document_store/faiss.py @@ -51,8 +51,8 @@ class FAISSDocumentStore(SQLDocumentStore): Recommended options: - "Flat" (default): Best accuracy (= exact). Becomes slow and RAM intense for > 1 Mio docs. - "HNSW": Graph-based heuristic. If not further specified, - we use a RAM intense, but more accurate config: - HNSW256, efConstruction=256 and efSearch=256 + we use the following config: + HNSW64, efConstruction=80 and efSearch=20 - "IVFx,Flat": Inverted Index. Replace x with the number of centroids aka nlist. Rule of thumb: nlist = 10 * sqrt (num_docs) is a good starting point. For more details see: @@ -103,7 +103,7 @@ class FAISSDocumentStore(SQLDocumentStore): if index_factory == "HNSW" and metric_type == faiss.METRIC_INNER_PRODUCT: # faiss index factory doesn't give the same results for HNSW IP, therefore direct init. # defaults here are similar to DPR codebase (good accuracy, but very high RAM consumption) - n_links = kwargs.get("n_links", 128) + n_links = kwargs.get("n_links", 64) index = faiss.IndexHNSWFlat(vector_dim, n_links, metric_type) index.hnsw.efSearch = kwargs.get("efSearch", 20)#20 index.hnsw.efConstruction = kwargs.get("efConstruction", 80)#80 diff --git a/haystack/document_store/milvus.py b/haystack/document_store/milvus.py index f14f7fa93..d89b986b0 100644 --- a/haystack/document_store/milvus.py +++ b/haystack/document_store/milvus.py @@ -99,7 +99,8 @@ class MilvusDocumentStore(SQLDocumentStore): self.index_file_size = index_file_size if similarity == "dot_product": - self.metric_type = MetricType.L2 + self.metric_type = MetricType.IP + self.similarity = similarity else: raise ValueError("The Milvus document store can currently only support dot_product similarity. " "Please set similarity=\"dot_product\"") diff --git a/haystack/document_store/sql.py b/haystack/document_store/sql.py index df1871c66..ab2fabd91 100644 --- a/haystack/document_store/sql.py +++ b/haystack/document_store/sql.py @@ -124,27 +124,19 @@ class SQLDocumentStore(BaseDocumentStore): return documents - def get_documents_by_vector_ids( - self, - vector_ids: List[str], - index: Optional[str] = None, - batch_size: int = 10_000 - ): - """ - Fetch documents by specifying a list of text vector id strings + def get_documents_by_vector_ids(self, vector_ids: List[str], index: Optional[str] = None, batch_size: int = 10_000): + """Fetch documents by specifying a list of text vector id strings""" + index = index or self.index - :param vector_ids: List of vector_id strings. - :param index: Name of the index to get the documents from. If None, the - DocumentStore's default index (self.index) will be used. - :param batch_size: When working with large number of documents, batching can help reduce memory footprint. - """ + documents = [] + for i in range(0, len(vector_ids), batch_size): + query = self.session.query(DocumentORM).filter( + DocumentORM.vector_id.in_(vector_ids[i: i + batch_size]), + DocumentORM.index == index + ) + for row in query.all(): + documents.append(self._convert_sql_row_to_document(row)) - result = self._query( - index=index, - vector_ids=vector_ids, - batch_size=batch_size - ) - documents = list(result) sorted_documents = sorted(documents, key=lambda doc: vector_ids.index(doc.meta["vector_id"])) return sorted_documents diff --git a/test/benchmarks/config.json b/test/benchmarks/config.json index 71b9b174b..80e47a1bc 100644 --- a/test/benchmarks/config.json +++ b/test/benchmarks/config.json @@ -2,6 +2,14 @@ "params": { "full": { "retriever_doc_stores": [ + [ + "dpr", + "milvus_flat" + ], + [ + "dpr", + "milvus_hnsw" + ], [ "elastic", "elasticsearch" diff --git a/test/benchmarks/results_to_json.py b/test/benchmarks/results_to_json.py index cbd7fe549..03dac1a9f 100644 --- a/test/benchmarks/results_to_json.py +++ b/test/benchmarks/results_to_json.py @@ -44,7 +44,9 @@ def retriever(index_csv="retriever_index_results.csv", query_csv="retriever_quer "elasticsearch": "ElasticSearch", "faiss": "FAISS", "faiss_flat": "FAISS (flat)", - "faiss_hnsw": "FAISS (HSNW)" + "faiss_hnsw": "FAISS (HNSW)", + "milvus_flat": "Milvus (flat)", + "milvus_hnsw": "Milvus (HNSW)" } index = pd.read_csv(index_csv) diff --git a/test/benchmarks/retriever_index_results.csv b/test/benchmarks/retriever_index_results.csv index b587e10b4..4b7f83b1e 100644 --- a/test/benchmarks/retriever_index_results.csv +++ b/test/benchmarks/retriever_index_results.csv @@ -1,17 +1,25 @@ ,retriever,doc_store,n_docs,indexing_time,docs_per_second,date_time,error -1,dpr,elasticsearch,1000,15.336494209999728,65.20394989279743,2021-01-29 11:18:25.436371, -5,dpr,elasticsearch,10000,144.0823780490009,69.40474008972218,2021-01-29 11:23:19.896920, -9,dpr,elasticsearch,100000,1433.587170629,69.75508852811794,2021-01-29 11:56:22.502185, -13,dpr,elasticsearch,500000,7196.396471723998,69.47921810097519,2021-01-29 14:54:08.769187, -0,elastic,elasticsearch,1000,2.1182381880007597,472.0904408506686,2021-01-29 11:17:14.160560, -4,elastic,elasticsearch,10000,20.23965223199957,494.0796356268248,2021-01-29 11:20:27.378846, -8,elastic,elasticsearch,100000,207.03962336699988,482.9993330442806,2021-01-29 11:31:50.829072, -12,elastic,elasticsearch,500000,1029.1638562459993,485.8312862091863,2021-01-29 12:52:45.994426, -2,dpr,faiss_flat,1000,9.899907313998483,101.01104669798258,2021-01-29 11:19:11.304749, -6,dpr,faiss_flat,10000,104.1660261320012,96.00059032037764,2021-01-29 11:25:43.069491, -10,dpr,faiss_flat,100000,1046.8892760299987,95.52108545730724,2021-01-29 12:14:51.105055, -14,dpr,faiss_flat,500000,5243.775349973999,95.35114810028603,2021-01-29 16:24:19.855339, -3,dpr,faiss_hnsw,1000,10.329135104999295,96.81352696374361,2021-01-29 11:19:55.337391, -7,dpr,faiss_hnsw,10000,112.53792207699917,88.85893586304122,2021-01-29 11:28:10.284866, -11,dpr,faiss_hnsw,100000,1188.8019736170008,84.11829911061136,2021-01-29 12:35:16.166263, -15,dpr,faiss_hnsw,500000,6125.295488232001,81.62871504903015,2021-01-29 18:07:08.100722, +9,dpr,elasticsearch,10000,139.7465313429998,71.55812673057035,2021-04-12 13:06:34.024778, +14,elastic,elasticsearch,100000,205.94765839000047,485.56026702003703,2021-04-12 13:44:31.464961, +8,elastic,elasticsearch,10000,19.96974077699997,500.7576268349683,2021-04-12 13:03:44.944941, +3,dpr,elasticsearch,1000,14.592372578999857,68.52895199777984,2021-04-12 12:58:01.128834, +2,elastic,elasticsearch,1000,2.1051091760000418,475.034744706267,2021-04-12 12:57:18.604681, +15,dpr,elasticsearch,100000,1401.1558383250003,71.36964873196699,2021-04-12 14:08:31.400192, +20,elastic,elasticsearch,500000,1027.416534557,486.6575368242339,2021-04-12 17:30:22.080196, +21,dpr,elasticsearch,500000,7010.269106937998,71.32393812174124,2021-04-12 19:28:39.657070, +4,dpr,faiss_flat,1000,9.570316116999948,104.48975642755202,2021-04-12 12:58:47.918981, +22,dpr,faiss_flat,500000,5041.962777018001,99.16772933728758,2021-04-12 20:55:28.443354, +10,dpr,faiss_flat,10000,95.71089355200002,104.48131481049198,2021-04-12 13:08:50.343175, +16,dpr,faiss_flat,100000,999.8815230299997,100.0118491008456,2021-04-12 14:26:14.495997, +11,dpr,faiss_hnsw,10000,108.9302881550002,91.80183188142033,2021-04-12 13:11:13.117266, +17,dpr,faiss_hnsw,100000,1112.2988848330006,89.90389306648807,2021-04-12 14:45:22.644624, +23,dpr,faiss_hnsw,500000,5802.5877488399965,86.16845132586847,2021-04-12 22:32:53.095579, +5,dpr,faiss_hnsw,1000,9.837438108000242,101.65248197970928,2021-04-12 12:59:30.777696, +0,dpr,milvus_flat,1000,9.717840198999966,102.90352377917338,2021-04-12 12:56:32.363797, +6,dpr,milvus_flat,10000,87.06480573199997,114.85697252666792,2021-04-12 13:01:21.834327, +12,dpr,milvus_flat,100000,861.995940363,116.00982709720004,2021-04-12 13:26:00.742197, +18,dpr,milvus_flat,500000,4364.3841063849995,114.56370195934652,2021-04-12 15:58:40.069278, +1,dpr,milvus_hnsw,1000,8.522245804999784,117.33996212750934,2021-04-12 12:57:04.976604, +7,dpr,milvus_hnsw,10000,87.128293364,114.77327988306308,2021-04-12 13:03:13.381764, +19,dpr,milvus_hnsw,500000,4414.051032668,113.27463056035022,2021-04-12 17:12:50.943619, +13,dpr,milvus_hnsw,100000,864.9713281529998,115.61076852516385,2021-04-12 13:40:51.875517, diff --git a/test/benchmarks/retriever_query_results.csv b/test/benchmarks/retriever_query_results.csv index 106ab88ee..a06bc5adb 100644 --- a/test/benchmarks/retriever_query_results.csv +++ b/test/benchmarks/retriever_query_results.csv @@ -1,17 +1,27 @@ ,retriever,doc_store,n_docs,n_queries,retrieve_time,queries_per_second,seconds_per_query,recall,map,top_k,date_time,error -1,dpr,elasticsearch,1000,1064,34.67547423102587,30.684511851549132,0.0325897314201371,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:27:43.048502, -5,dpr,elasticsearch,10000,5637,288.06125728895495,19.568754413737462,0.05110187285594375,0.9749866950505588,0.8987097014904354,10,2021-02-01 11:37:21.149887, -9,dpr,elasticsearch,100000,5637,1225.6274364620313,4.5992769354707805,0.2174254810115365,0.9579563597658329,0.8654564090434241,10,2021-02-01 12:15:52.757320, -13,dpr,elasticsearch,500000,5637,5339.008413678017,1.0558140319761546,0.947136493467805,0.9308142629058009,0.8086137228234089,10,2021-02-01 14:52:23.056230, -0,elastic,elasticsearch,1000,1064,4.046542626992959,262.9405144288997,0.0038031415667227064,0.8909774436090225,0.742044471297291,10,2021-02-01 11:26:04.346134, -4,elastic,elasticsearch,10000,5637,30.701430243001596,183.6070813438718,0.005446413028738974,0.8110697179350719,0.6620627317806674,10,2021-02-01 11:31:20.470092, -8,elastic,elasticsearch,100000,5637,34.705507238930295,162.42378943468643,0.006156733588598598,0.7193542664537875,0.5625959153101251,10,2021-02-01 11:50:36.048887, -12,elastic,elasticsearch,500000,5637,68.3838099470413,82.4317920333114,0.01213124178588634,0.6274614156466205,0.45594527090003406,10,2021-02-01 13:02:16.905187, -2,dpr,faiss_flat,1000,1064,30.053267333012627,35.40380445859966,0.028245552004711117,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:28:55.544474, -6,dpr,faiss_flat,10000,5637,218.59436279792135,25.78749025294445,0.038778492602079356,0.9749866950505588,0.8987097014904354,10,2021-02-01 11:42:07.545869, -10,dpr,faiss_flat,100000,5637,865.7440476809643,6.511162294559943,0.15358241044544338,0.9579563597658329,0.8654606328368972,10,2021-02-01 12:34:29.493598, -14,dpr,faiss_flat,500000,5637,3717.9468668280497,1.5161593755666505,0.6595612678424783,0.9308142629058009,0.808613722823409,10,2021-02-01 16:12:52.804436, -3,dpr,faiss_hnsw,1000,1064,27.167708159968242,39.164142729117266,0.02553356030072203,0.9915413533834586,0.9295105322830889,10,2021-02-01 11:30:02.684535, -7,dpr,faiss_hnsw,10000,5637,167.55242089293097,33.6432023480111,0.02972368651639719,0.972503104488203,0.8969941373746582,10,2021-02-01 11:46:07.130588, -11,dpr,faiss_hnsw,100000,5637,167.48228578322596,33.65729082116796,0.029711244595214823,0.9402164271775767,0.8507984377595874,10,2021-02-01 12:43:21.697968, -15,dpr,faiss_hnsw,500000,5637,164.45566683610014,34.27671486454735,0.029174324434291313,0.8825616462657442,0.7691475821598232,10,2021-02-01 16:47:01.710072, +9,dpr,elasticsearch,10000,5637,245.9020519811238,22.92376153263135,0.043622858254589995,97.49866950505589,89.87097014904357,10,2021-04-12 22:50:49.396735, +14,elastic,elasticsearch,100000,5637,34.05730972948368,165.51512861040828,0.006041743787384013,71.93542664537875,56.25959153101251,10,2021-04-12 23:13:45.908089, +8,elastic,elasticsearch,10000,5637,21.6593179620686,260.2575025618042,0.0038423484055470285,81.1424516586837,66.33019927857616,10,2021-04-12 22:45:31.522940, +3,dpr,elasticsearch,1000,1064,30.71146336113452,34.645044017879535,0.02886415729429936,99.15413533834587,92.9510532283089,10,2021-04-12 22:36:03.760187, +2,elastic,elasticsearch,1000,1064,2.977487905904127,357.3482189096959,0.002798390889007638,89.09774436090225,74.2044471297291,10,2021-04-12 22:34:57.937655, +15,dpr,elasticsearch,100000,5637,1052.5279855618428,5.3556770720836955,0.18671775511120148,95.7956359765833,86.54606328368972,10,2021-04-12 23:36:05.648808, +20,elastic,elasticsearch,500000,5637,62.07282430688065,90.81268756406739,0.011011677187667313,62.746141564662054,45.60339705629754,10,2021-04-13 00:49:21.340846, +21,dpr,elasticsearch,500000,5637,4574.405559586667,1.2322912620168613,0.8114964625841169,93.0814262905801,80.8613722823409,10,2021-04-13 02:26:34.009405, +4,dpr,faiss_flat,1000,1064,26.330981328930648,40.408672457298465,0.024747162903130306,99.15413533834587,92.9510532283089,10,2021-04-12 22:37:12.563041, +22,dpr,faiss_flat,500000,5637,3657.0574446742394,1.5414031869280982,0.6487595253990136,93.0814262905801,80.86137228234091,10,2021-04-13 03:45:25.432560, +10,dpr,faiss_flat,10000,5637,201.07035204221756,28.03496359730067,0.03566974490725875,97.49866950505589,89.87097014904354,10,2021-04-12 22:55:15.966314, +16,dpr,faiss_flat,100000,5637,850.93480880688,6.6244792687512675,0.150955261452347,95.7956359765833,86.54606328368972,10,2021-04-12 23:54:18.954839, +11,dpr,faiss_hnsw,10000,5637,133.81918276423676,42.12400556900196,0.023739432812530912,96.96647152740819,89.49563682134192,10,2021-04-12 22:58:37.011857, +17,dpr,faiss_hnsw,100000,5637,138.56263777621643,40.68196225525062,0.024580918534010367,93.18786588610963,84.33419639513305,10,2021-04-13 00:02:06.239176, +23,dpr,faiss_hnsw,500000,5637,142.9722781331293,39.42722374998517,0.025363185760711247,86.90792974986695,75.73315903145605,10,2021-04-13 04:15:55.931738, +5,dpr,faiss_hnsw,1000,1064,24.00786596405669,44.31880791041422,0.022563783800805162,99.15413533834587,92.9510532283089,10,2021-04-12 22:38:15.165568, +0,dpr,milvus_flat,1000,1064,26.284036851771816,40.480844171707794,0.02470304215392088,99.15413533834587,92.9510532283089,10,2021-04-12 22:33:51.172786, +6,dpr,milvus_flat,10000,5637,147.82553930512222,38.132788329389,0.0262241510209548,97.49866950505589,89.87097014904354,10,2021-04-12 22:41:19.833104, +12,dpr,milvus_flat,100000,5637,199.15962131407287,28.30393009791128,0.03533078256414278,95.7956359765833,86.54606328368972,10,2021-04-12 23:03:55.360165, +18,dpr,milvus_flat,500000,5637,368.32888050779,15.30425741318099,0.06534129510516055,93.0814262905801,80.8613722823409,10,2021-04-13 00:17:05.346842, +1,dpr,milvus_hnsw,1000,1064,26.34384093301196,40.38894718145225,0.02475924899719169,99.15413533834587,92.9510532283089,10,2021-04-12 22:34:43.389192, +7,dpr,milvus_hnsw,10000,5637,147.73994229108212,38.154881561370836,0.02620896616836653,97.49866950505589,89.87097014904354,10,2021-04-12 22:44:25.082029, +19,dpr,milvus_hnsw,500000,5637,230.05139175982913,24.50322059292282,0.04081096181653879,85.71935426645379,74.85616575291942,10,2021-04-13 00:30:29.654851, +13,dpr,milvus_hnsw,100000,5637,200.77329401532916,28.076443272229284,0.035617047013540744,95.7956359765833,86.54606328368972,10,2021-04-12 23:09:18.273909, + + diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index c4ff05ea3..0468f102c 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -3,6 +3,7 @@ from haystack.document_store.sql import SQLDocumentStore from haystack.document_store.memory import InMemoryDocumentStore from haystack.document_store.elasticsearch import Elasticsearch, ElasticsearchDocumentStore from haystack.document_store.faiss import FAISSDocumentStore +from haystack.document_store.milvus import MilvusDocumentStore, IndexType from haystack.retriever.sparse import ElasticsearchRetriever, TfidfRetriever from haystack.retriever.dense import DensePassageRetriever, EmbeddingRetriever from haystack.reader.farm import FARMReader @@ -39,6 +40,16 @@ def get_document_store(document_store_type, similarity='dot_product'): client = Elasticsearch() client.indices.delete(index='haystack_test*', ignore=[404]) document_store = ElasticsearchDocumentStore(index="eval_document", similarity=similarity, timeout=3000) + elif document_store_type in ("milvus_flat", "milvus_hnsw"): + if document_store_type == "milvus_flat": + index_type = IndexType.FLAT + index_param = None + search_param = None + elif document_store_type == "milvus_hnsw": + index_type = IndexType.HNSW + index_param = {"M": 64, "efConstruction": 80} + search_param = {"ef": 20} + document_store = MilvusDocumentStore(similarity=similarity, index_type=index_type, index_param=index_param, search_param=search_param) assert document_store.get_document_count(index="eval_document") == 0 elif document_store_type in("faiss_flat", "faiss_hnsw"): if document_store_type == "faiss_flat": @@ -54,7 +65,7 @@ def get_document_store(document_store_type, similarity='dot_product'): shell=True) time.sleep(6) status = subprocess.run( - ['docker exec -it haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'], shell=True) + ['docker exec haystack-postgres psql -U postgres -c "CREATE DATABASE haystack;"'], shell=True) time.sleep(1) document_store = FAISSDocumentStore(sql_url="postgresql://postgres:password@localhost:5432/haystack", faiss_index_factory_str=index_type,