Implement OpenSearch ANN (#1225)

* Simplify ODES init

* Add arguments to ES init and create script

* Rename similarity_fn_name and add util fn

* Create OpenSearchDocumentStore

* Specify params of Open Search HNSW

* Add better argument handling

* Update opensearch index mapping

* Edit opensearch default port

* Fix HNSW mapping

* Force small HNSW params

* Implement auto start and stopping of document store services

* Fix starting and stopping of ds service

* Restore HNSW params

* Add opensearch query benchmarks

* Add write wait time

* Revert wait time

* Add timeout

* Update benchmarks

* Update benchmarks

* Update benchmarks json

* Update documentation

* Update documentation

* Fix similarity name

* Improve argument passing

* Improve stopping and starting of service
This commit is contained in:
Branden Chan 2021-07-26 10:52:52 +02:00 committed by GitHub
parent 4c2a0b914a
commit 363be65a78
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 314 additions and 33 deletions

View File

@ -159,6 +159,46 @@
"model": "BM25 / Elasticsearch",
"n_docs": 1000,
"map": 74.20444712972909
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 1000,
"map": 92.95105322830891
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 10000,
"map": 89.8709701490436
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 100000,
"map": 86.54014997282701
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 1000,
"map": 92.76308330349686
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 10000,
"map": 89.00403653862938
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 100000,
"map": 85.7342431384476
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 500000,
"map": 80.85588135082547
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 500000,
"map": 77.5426462347698
}
]
}

View File

@ -69,6 +69,20 @@
"index_speed": 115.61076852516383,
"query_speed": 38.80526238789059,
"map": 81.63864883662649
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 100000,
"index_speed": 70.05381128388427,
"query_speed": 15.306895223372484,
"map": 86.54014997282701
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 100000,
"index_speed": 70.31004397719536,
"query_speed": 24.95733865947408,
"map": 85.7342431384476
}
]
}

View File

@ -159,6 +159,46 @@
"model": "BM25 / Elasticsearch",
"n_docs": 1000,
"query_speed": 282.95914917837337
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 1000,
"query_speed": 29.061163356184426
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 10000,
"query_speed": 24.834414667596725
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 100000,
"query_speed": 15.306895223372484
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 1000,
"query_speed": 29.10621389658101
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 10000,
"query_speed": 26.92417300437131
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 100000,
"query_speed": 24.95733865947408
},
{
"model": "DPR / OpenSearch (flat)",
"n_docs": 500000,
"query_speed": 11.33271222977541
},
{
"model": "DPR / OpenSearch (HNSW)",
"n_docs": 500000,
"query_speed": 24.13921492357397
}
]
}

View File

@ -47,9 +47,9 @@ from haystack.document_store import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore()
```
Note that we also support [Open Distro for Elasticsearch](https://opendistro.github.io/for-elasticsearch-docs/).
Follow [their documentation](https://opendistro.github.io/for-elasticsearch-docs/docs/install/)
to run it and connect to it using Haystack's `OpenDistroElasticsearchDocumentStore` class.
Note that we also support [OpenSearch](https://opensearch.org/).
Follow [their documentation](https://opensearch.org/docs/)
to run it and connect to it using Haystack's `OpenSearchDocumentStore` class.
We further support [AWS Elastic Search Service](https://aws.amazon.com/elasticsearch-service/) with [signed Requests](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html):
Use e.g. [aws-requests-auth](https://github.com/davidmuller/aws-requests-auth) to create an auth object and pass it as `aws4auth` to the `ElasticsearchDocumentStore` constructor.
@ -143,6 +143,32 @@ document_store = WeaviateDocumentStore()
</div>
</div>
<div class="tab">
<input type="radio" id="tab-1-1" name="tab-group-1" checked>
<label class="labelouter" for="tab-1-1">OpenSearch</label>
<div class="tabcontent">
See the official [OpenSearch documentation](https://opensearch.org/docs/opensearch/install/docker/) on how to install and start an instance.
If you have Docker set up, we recommend pulling the Docker image and running it.
```bash
docker pull opensearchproject/opensearch:1.0.0
docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.0
```
Note that we also have a utility function `haystack.utils.launch_opensearch` that can start up an OpenSearch instance.
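Next you can initialize the Haystack object that will connect to this instance. Note that `OpenSearchDocumentStore` connects to port 9201 by default (the host port that `launch_opensearch` maps), so pass `port=9200` if you started the container with the command above.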
```python
from haystack.document_store import OpenSearchDocumentStore
document_store = OpenSearchDocumentStore()
```
</div>
</div>
</div>
Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes.
@ -303,6 +329,21 @@ The Document Stores have different characteristics. You should choose one depend
- Fewer options for ANN algorithms than FAISS or Milvus
- No BM25 / Tf-idf retrieval
</div>
</div>
<div class="tab">
<input type="radio" id="tab-2-6" name="tab-group-2">
<label class="labelouter" for="tab-2-6">OpenSearch</label>
<div class="tabcontent">
**Pros:**
- Fully open source fork of Elasticsearch
- Has support for Approximate Nearest Neighbours vector search
**Cons:**
- Its ANN algorithms seem a little less performant than FAISS or Milvus in our benchmarks
</div>
</div>

View File

@ -1,4 +1,4 @@
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore, OpenDistroElasticsearchDocumentStore
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore, OpenDistroElasticsearchDocumentStore, OpenSearchDocumentStore
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.document_store.milvus import MilvusDocumentStore

View File

@ -48,6 +48,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
timeout=30,
return_embedding: bool = False,
duplicate_documents: str = 'overwrite',
index_type: str = "flat"
):
"""
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -95,6 +96,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
overwrite: Update any existing documents with the same ID when adding documents.
fail: an error is raised if the document ID of the document being added already
exists.
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
ElasticsearchDocumentStore does not support HNSW but OpenSearchDocumentStore does.
"""
# save init parameters to enable export of component config as YAML
@ -105,7 +108,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
custom_mapping=custom_mapping, excluded_meta_data=excluded_meta_data, analyzer=analyzer, scheme=scheme,
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
timeout=timeout, return_embedding=return_embedding,
timeout=timeout, return_embedding=return_embedding, index_type=index_type
)
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
@ -131,10 +134,17 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.custom_mapping = custom_mapping
self.index: str = index
self.label_index: str = label_index
if similarity in ["cosine", "dot_product"]:
if similarity in ["cosine", "dot_product", "l2"]:
self.similarity = similarity
else:
raise Exception("Invalid value for similarity in ElasticSearchDocumentStore constructor. Choose between 'cosine' and 'dot_product'")
raise Exception(f"Invalid value {similarity} for similarity in ElasticSearchDocumentStore constructor. Choose between 'cosine', 'l2' and 'dot_product'")
if index_type in ["flat", "hnsw"]:
self.index_type = index_type
else:
raise Exception("Invalid value for index_type in constructor. Choose between 'flat' and 'hnsw'")
if index_type == "hnsw" and type(self) == ElasticsearchDocumentStore:
raise Exception("The HNSW algorithm for approximate nearest neighbours calculation is currently not available in the ElasticSearchDocumentStore. "
"Try the OpenSearchDocumentStore instead.")
if create_index:
self._create_document_index(index)
self._create_label_index(label_index)
@ -142,6 +152,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.duplicate_documents = duplicate_documents
self.refresh_type = refresh_type
def _init_elastic_client(self,
host: Union[str, List[str]],
port: Union[int, List[int]],
@ -356,7 +367,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
return buckets
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None,
batch_size: int = 10_000,duplicate_documents: Optional[str] = None):
batch_size: int = 10_000, duplicate_documents: Optional[str] = None):
"""
Indexes documents for later queries in Elasticsearch.
@ -805,7 +816,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
if self.similarity == "cosine":
similarity_fn_name = "cosineSimilarity"
elif self.similarity == "dot_product":
similarity_fn_name = "dotProduct"
if type(self) == OpenSearchDocumentStore:
similarity_fn_name = "innerproduct"
elif type(self) == ElasticsearchDocumentStore:
similarity_fn_name = "dotProduct"
else:
raise Exception("Invalid value for similarity in ElasticSearchDocumentStore constructor. Choose between \'cosine\' and \'dot_product\'")
@ -997,14 +1011,34 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
if self.refresh_type == "wait_for":
time.sleep(2)
class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
class OpenSearchDocumentStore(ElasticsearchDocumentStore):
"""
Document Store using the Open Distro for Elasticsearch. It is compatible with the AWS Elasticsearch Service.
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
the KNN plugin that can scale to a large number of documents.
"""
def __init__(self, verify_certs=False, scheme="https", username="admin", password="admin", port=9201, **kwargs):
    """
    Initialize the document store with connection defaults that differ from the parent class.

    The defaults declared here (no certificate verification, "https" scheme, "admin"/"admin"
    credentials, port 9201) replace the parent's defaults so that, in default cases, an
    OpenSearchDocumentStore can be created without providing any arguments.

    :param verify_certs: Forwarded to the parent constructor.
    :param scheme: Forwarded to the parent constructor.
    :param username: Forwarded to the parent constructor.
    :param password: Forwarded to the parent constructor.
    :param port: Forwarded to the parent constructor.
    :param kwargs: Any further keyword arguments, passed through to the parent constructor untouched.
    """
    connection_defaults = dict(
        verify_certs=verify_certs,
        scheme=scheme,
        username=username,
        password=password,
        port=port,
    )
    super(OpenSearchDocumentStore, self).__init__(**connection_defaults, **kwargs)
def _create_document_index(self, index_name: str):
"""
Create a new index for storing documents.
@ -1038,21 +1072,40 @@ class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
}
}
if self.embedding_field:
if self.similarity == "cosine":
similarity_space_type = "cosinesimil"
elif self.similarity == "dot_product":
similarity_space_type = "innerproduct"
elif self.similarity == "l2":
similarity_space_type = "l2"
else:
raise Exception(
f"Similarity function {self.similarity} is not supported by OpenDistroElasticsearchDocumentStore."
)
mapping["settings"]["knn"] = True
mapping["settings"]["knn.space_type"] = similarity_space_type
mapping["settings"]["index"] = {}
mapping["settings"]["index"]["knn"] = True
mapping["settings"]["index"]["knn.space_type"] = similarity_space_type
mapping["mappings"]["properties"][self.embedding_field] = {
"type": "knn_vector",
"dimension": self.embedding_dim,
}
if self.index_type == "flat":
pass
elif self.index_type == "hnsw":
mapping["settings"]["index"]["knn.algo_param"] = {}
mapping["settings"]["index"]["knn.algo_param"]["ef_search"] = 20
mapping["mappings"]["properties"][self.embedding_field]["method"] = {
"space_type": similarity_space_type,
"name": "hnsw",
"engine": "nmslib",
"parameters": {
"ef_construction": 80,
"m": 64
}
}
else:
logger.error("Please set index_type to either 'flat' or 'hnsw'")
try:
self.client.indices.create(index=index_name, body=mapping)
except RequestError as e:
@ -1063,6 +1116,7 @@ class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
if not self.client.indices.exists(index=index_name):
raise e
def _get_vector_similarity_query(self, query_emb: np.ndarray, top_k: int):
"""
Generate Elasticsearch query for vector similarity.
@ -1072,3 +1126,10 @@ class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
def _scale_embedding_score(self, score):
    """
    Return the given score unchanged; no rescaling is applied here.

    :param score: The score to pass through.
    :return: The same `score` object that was passed in.
    """
    return score
class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore):
    """
    Deprecated name kept for backward compatibility; behaves like OpenSearchDocumentStore.

    Creating an instance logs a warning recommending OpenSearchDocumentStore and then
    initializes the store through the parent constructor.
    """

    def __init__(self, **kwargs):
        """
        Log the deprecation warning and delegate initialization to OpenSearchDocumentStore.

        :param kwargs: Keyword arguments forwarded unchanged to OpenSearchDocumentStore.__init__.
                       When omitted, the parent's defaults apply.
        """
        logger.warning("Open Distro for Elasticsearch has been replaced by OpenSearch! "
                       "See https://opensearch.org/faq/ for details. "
                       "We recommend using the OpenSearchDocumentStore instead.")
        # Without this call the instance would have no client, index names or settings,
        # and callers could not pass any connection arguments.
        super(OpenDistroElasticsearchDocumentStore, self).__init__(**kwargs)

View File

@ -118,7 +118,7 @@ class MilvusDocumentStore(SQLDocumentStore):
self.similarity = similarity
else:
raise ValueError("The Milvus document store can currently only support dot_product and L2 similarity. "
"Please set similarity=\"dot_product\"")
"Please set similarity=\"dot_product\" or \"l2\"")
self.index_type = index_type
self.index_param = index_param or {"nlist": 16384}

View File

@ -14,10 +14,8 @@ import torch
logger = logging.getLogger(__name__)
def launch_es():
# Start an Elasticsearch server
# You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in
# your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.
def launch_es(sleep=15):
# Start an Elasticsearch server via Docker
logger.info("Starting Elasticsearch ...")
status = subprocess.run(
@ -27,13 +25,59 @@ def launch_es():
logger.warning("Tried to start Elasticsearch through Docker but this failed. "
"It is likely that there is already an existing Elasticsearch instance running. ")
else:
time.sleep(15)
time.sleep(sleep)
def launch_open_distro_es(sleep=15):
    """
    Start an Open Distro for Elasticsearch server via Docker.

    :param sleep: Seconds to wait after the `docker run` command returned successfully.
                  No waiting happens when the command reports a non-zero return code.
    """
    logger.info("Starting Open Distro for Elasticsearch ...")
    docker_cmd = 'docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" amazon/opendistro-for-elasticsearch:1.13.2'
    result = subprocess.run([docker_cmd], shell=True)
    if result.returncode == 0:
        time.sleep(sleep)
        return
    logger.warning("Tried to start Open Distro for Elasticsearch through Docker but this failed. "
                   "It is likely that there is already an existing Elasticsearch instance running. ")
def launch_opensearch(sleep=15):
    """
    Start an OpenSearch server via Docker in a container named "opensearch".

    :param sleep: Seconds to wait after the `docker run` command returned successfully.
                  No waiting happens when the command reports a non-zero return code.
    """
    logger.info("Starting OpenSearch...")
    # A new container cannot be started under the name "opensearch" while a stopped container
    # with the same name still exists, so remove it first.
    # docker rm only succeeds if the container is stopped, not if it is running.
    subprocess.run(['docker rm opensearch'], shell=True, stdout=subprocess.DEVNULL)
    docker_cmd = 'docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name opensearch opensearchproject/opensearch:1.0.0-rc1'
    result = subprocess.run([docker_cmd], shell=True)
    if result.returncode == 0:
        time.sleep(sleep)
        return
    logger.warning("Tried to start OpenSearch through Docker but this failed. "
                   "It is likely that there is already an existing OpenSearch instance running. ")
def launch_milvus():
# Start a Milvus server
# You can start Milvus on your local machine instance using Docker. If Docker is not readily available in
# your environment (eg., in Colab notebooks)
def stop_opensearch():
    """
    Stop the Docker container named "opensearch" and then remove it.

    A warning is logged when `docker stop` reports a non-zero return code; the
    `docker rm` call is issued afterwards in either case.
    """
    logger.info("Stopping OpenSearch...")
    stop_result = subprocess.run(['docker stop opensearch'], shell=True)
    if stop_result.returncode != 0:
        logger.warning("Tried to stop OpenSearch but this failed. "
                       "It is likely that there was no OpenSearch Docker container with the name opensearch")
    subprocess.run(['docker rm opensearch'], shell=True)
def stop_service(document_store):
    """
    Stop the service behind the given document store, if auto stopping is supported for it.

    Only document stores whose type name contains "OpenSearchDocumentStore" are handled;
    for anything else a warning is logged and nothing is stopped.

    :param document_store: The document store instance whose backing service should be stopped.
    """
    store_type_repr = str(type(document_store))
    if "OpenSearchDocumentStore" not in store_type_repr:
        logger.warning(f"No support yet for auto stopping the service behind a {store_type_repr}")
        return
    stop_opensearch()
def launch_milvus(sleep=15):
# Start a Milvus server via docker
logger.info("Starting Milvus ...")
logger.warning("Automatic Milvus config creation not yet implemented. "
"If you are starting Milvus using launch_milvus(), "
@ -55,7 +99,7 @@ def launch_milvus():
logger.warning("Tried to start Milvus through Docker but this failed. "
"It is likely that there is already an existing Milvus instance running. ")
else:
time.sleep(15)
time.sleep(sleep)
def print_answers(results: dict, details: str = "all"):
@ -195,6 +239,7 @@ def get_batches_from_generator(iterable, n):
"""
Batch elements of an iterable into fixed-length chunks or blocks.
"""
# TODO consider moving to base.DocumentStore
it = iter(iterable)
x = tuple(islice(it, n))
while x:

View File

@ -6,6 +6,18 @@
"elastic",
"elasticsearch"
],
[
"elastic",
"opensearch_flat"
],
[
"dpr",
"opensearch_flat"
],
[
"dpr",
"opensearch_hnsw"
],
[
"dpr",
"elasticsearch"

View File

@ -47,7 +47,9 @@ def retriever(index_csv="retriever_index_results.csv", query_csv="retriever_quer
"faiss_hnsw": "FAISS (HNSW)",
"milvus_flat": "Milvus (flat)",
"milvus_hnsw": "Milvus (HNSW)",
"sentence_transformers": "Sentence Transformers"
"sentence_transformers": "Sentence Transformers",
"opensearch_flat": "OpenSearch (flat)",
"opensearch_hnsw": "OpenSearch (HNSW)"
}
index = pd.read_csv(index_csv)

View File

@ -19,6 +19,7 @@ from farm.file_utils import http_get
import json
from results_to_json import retriever as retriever_json
from templates import RETRIEVER_TEMPLATE, RETRIEVER_MAP_TEMPLATE, RETRIEVER_SPEED_TEMPLATE
from haystack.utils import stop_service
logger = logging.getLogger(__name__)
logging.getLogger("haystack.retriever.base").setLevel(logging.WARN)
@ -86,6 +87,7 @@ def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_
with open(md_file, "w") as f:
f.write(str(retriever_df.to_markdown()))
time.sleep(10)
stop_service(doc_store)
del doc_store
del retriever
@ -108,6 +110,7 @@ def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(10)
stop_service(doc_store)
del doc_store
del retriever
if update_json:
@ -126,6 +129,7 @@ def benchmark_querying(n_docs_options,
embeddings_dir,
update_json,
save_markdown,
wait_write_limit=100,
**kwargs):
""" Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
retriever_results = []
@ -153,6 +157,7 @@ def benchmark_querying(n_docs_options,
add_precomputed=add_precomputed)
logger.info("Start indexing...")
index_to_doc_store(doc_store, docs, retriever, labels)
logger.info("Start queries...")
raw_results = retriever.eval()
@ -178,6 +183,7 @@ def benchmark_querying(n_docs_options,
doc_store.delete_all_documents(index=doc_index)
doc_store.delete_all_documents(index=label_index)
time.sleep(5)
stop_service(doc_store)
del doc_store
del retriever
except Exception:

View File

@ -26,4 +26,7 @@
0,sentence_transformers,elasticsearch,1000,10.380210993000219,96.33715544648746,2021-06-02 08:49:29.922794,
1,sentence_transformers,elasticsearch,10000,82.89545158599958,120.63388049253265,2021-06-02 08:51:09.796056,
2,sentence_transformers,elasticsearch,100000,836.6144149759998,119.52937722555106,2021-06-02 09:05:26.454063,
3,sentence_transformers,elasticsearch,500000,4207.770141414,118.82778364694073,2021-06-02 10:16:20.514575,
3,sentence_transformers,elasticsearch,500000,4207.770141414,118.82778364694073,2021-06-02 10:16:20.514575,
1,dpr,opensearch_flat,100000,1427.47408267,70.05381128388427,2021-07-22 12:33:02.890691,
0,elastic,opensearch_flat,100000,207.3902409509992,482.18276588833,2021-07-22 12:08:18.041527,
2,dpr,opensearch_hnsw,100000,1422.2719023249992,70.31004397719536,2021-07-22 12:57:54.770107,

1 retriever doc_store n_docs indexing_time docs_per_second date_time error
26 0 sentence_transformers elasticsearch 1000 10.380210993000219 96.33715544648746 2021-06-02 08:49:29.922794
27 1 sentence_transformers elasticsearch 10000 82.89545158599958 120.63388049253265 2021-06-02 08:51:09.796056
28 2 sentence_transformers elasticsearch 100000 836.6144149759998 119.52937722555106 2021-06-02 09:05:26.454063
29 3 sentence_transformers elasticsearch 500000 4207.770141414 118.82778364694073 2021-06-02 10:16:20.514575
30 1 dpr opensearch_flat 100000 1427.47408267 70.05381128388427 2021-07-22 12:33:02.890691
31 0 elastic opensearch_flat 100000 207.3902409509992 482.18276588833 2021-07-22 12:08:18.041527
32 2 dpr opensearch_hnsw 100000 1422.2719023249992 70.31004397719536 2021-07-22 12:57:54.770107

View File

@ -27,3 +27,11 @@
3,dpr,milvus_hnsw,100000,5637,145.2638032350751,38.80526238789059,0.0257697007690394,90.01241795281177,81.63864883662649,10,2021-06-01 12:07:43.734963,
10,dpr,milvus_hnsw,500000,5637,151.7069119141779,37.15717318924075,0.02691270390530032,84.1759801312755,73.57986207906387,10,2021-06-01 16:30:55.573386,
0,elastic,elasticsearch,1000,1064,3.760260104999361,282.95914917837337,0.003534079046052031,89.09774436090225,74.2044471297291,10,2021-06-02 08:27:07.187435,
0,dpr,opensearch_flat,1000,1064,36.61243656901206,29.061163356184426,0.034410184745312086,99.15413533834587,92.9510532283089,10,2021-07-19 15:25:29.480947,
2,dpr,opensearch_flat,10000,5637,226.98340490202918,24.834414667596725,0.040266703016148514,97.49866950505589,89.87097014904359,10,2021-07-19 15:32:29.913507,
4,dpr,opensearch_flat,100000,5637,368.2654070430117,15.306895223372486,0.06533003495529745,95.77789604399504,86.54014997282702,10,2021-07-19 15:49:18.090106,
1,dpr,opensearch_hnsw,1000,1064,36.55576791198473,29.106213896581007,0.03435692472930896,98.96616541353383,92.76308330349686,10,2021-07-19 15:27:04.463343,
3,dpr,opensearch_hnsw,10000,5637,209.3657621010234,26.924173004371312,0.03714134505961032,96.41653361717225,89.00403653862938,10,2021-07-19 15:37:40.060081,
5,dpr,opensearch_hnsw,100000,5637,225.86542887897758,24.95733865947408,0.040068374823306295,94.8199396842292,85.7342431384476,10,2021-07-19 15:58:37.854825,
0,dpr,opensearch_flat,500000,5637,497.4096126070708,11.33271222977541,0.0882401299640005,93.06368635799184,80.85588135082547,10,2021-07-21 13:12:13.891855,
1,dpr,opensearch_hnsw,500000,5637,233.5204362630284,24.139214923573974,0.04142636797286294,88.96576193010468,77.54264623476982,10,2021-07-21 13:39:28.533054,
1 retriever doc_store n_docs n_queries retrieve_time queries_per_second seconds_per_query recall map top_k date_time error
27 3 dpr milvus_hnsw 100000 5637 145.2638032350751 38.80526238789059 0.0257697007690394 90.01241795281177 81.63864883662649 10 2021-06-01 12:07:43.734963
28 10 dpr milvus_hnsw 500000 5637 151.7069119141779 37.15717318924075 0.02691270390530032 84.1759801312755 73.57986207906387 10 2021-06-01 16:30:55.573386
29 0 elastic elasticsearch 1000 1064 3.760260104999361 282.95914917837337 0.003534079046052031 89.09774436090225 74.2044471297291 10 2021-06-02 08:27:07.187435
30 0 dpr opensearch_flat 1000 1064 36.61243656901206 29.061163356184426 0.034410184745312086 99.15413533834587 92.9510532283089 10 2021-07-19 15:25:29.480947
31 2 dpr opensearch_flat 10000 5637 226.98340490202918 24.834414667596725 0.040266703016148514 97.49866950505589 89.87097014904359 10 2021-07-19 15:32:29.913507
32 4 dpr opensearch_flat 100000 5637 368.2654070430117 15.306895223372486 0.06533003495529745 95.77789604399504 86.54014997282702 10 2021-07-19 15:49:18.090106
33 1 dpr opensearch_hnsw 1000 1064 36.55576791198473 29.106213896581007 0.03435692472930896 98.96616541353383 92.76308330349686 10 2021-07-19 15:27:04.463343
34 3 dpr opensearch_hnsw 10000 5637 209.3657621010234 26.924173004371312 0.03714134505961032 96.41653361717225 89.00403653862938 10 2021-07-19 15:37:40.060081
35 5 dpr opensearch_hnsw 100000 5637 225.86542887897758 24.95733865947408 0.040068374823306295 94.8199396842292 85.7342431384476 10 2021-07-19 15:58:37.854825
36 0 dpr opensearch_flat 500000 5637 497.4096126070708 11.33271222977541 0.0882401299640005 93.06368635799184 80.85588135082547 10 2021-07-21 13:12:13.891855
37 1 dpr opensearch_hnsw 500000 5637 233.5204362630284 24.139214923573974 0.04142636797286294 88.96576193010468 77.54264623476982 10 2021-07-21 13:39:28.533054

View File

@ -1,13 +1,14 @@
import os
from haystack.document_store.sql import SQLDocumentStore
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.document_store.elasticsearch import Elasticsearch, ElasticsearchDocumentStore
from haystack.document_store.elasticsearch import Elasticsearch, ElasticsearchDocumentStore, OpenSearchDocumentStore
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.document_store.milvus import MilvusDocumentStore, IndexType
from haystack.retriever.sparse import ElasticsearchRetriever, TfidfRetriever
from haystack.retriever.dense import DensePassageRetriever, EmbeddingRetriever
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import launch_milvus, launch_es, launch_opensearch
from farm.file_utils import http_get
import logging
@ -36,11 +37,13 @@ def get_document_store(document_store_type, similarity='dot_product', index="doc
elif document_store_type == "memory":
document_store = InMemoryDocumentStore()
elif document_store_type == "elasticsearch":
launch_es()
# make sure we start from a fresh index
client = Elasticsearch()
client.indices.delete(index='haystack_test*', ignore=[404])
document_store = ElasticsearchDocumentStore(index="eval_document", similarity=similarity, timeout=3000)
elif document_store_type in ("milvus_flat", "milvus_hnsw"):
launch_milvus()
if document_store_type == "milvus_flat":
index_type = IndexType.FLAT
index_param = None
@ -57,7 +60,7 @@ def get_document_store(document_store_type, similarity='dot_product', index="doc
index=index
)
assert document_store.get_document_count(index="eval_document") == 0
elif document_store_type in("faiss_flat", "faiss_hnsw"):
elif document_store_type in ("faiss_flat", "faiss_hnsw"):
if document_store_type == "faiss_flat":
index_type = "Flat"
elif document_store_type == "faiss_hnsw":
@ -80,7 +83,13 @@ def get_document_store(document_store_type, similarity='dot_product', index="doc
index=index
)
assert document_store.get_document_count() == 0
elif document_store_type in ("opensearch_flat", "opensearch_hnsw"):
launch_opensearch()
if document_store_type == "opensearch_flat":
index_type = "flat"
elif document_store_type == "opensearch_hnsw":
index_type = "hnsw"
document_store = OpenSearchDocumentStore(index_type=index_type, timeout=3000)
else:
raise Exception(f"No document store fixture for '{document_store_type}'")
return document_store