mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-25 14:08:27 +00:00
Implement OpenSearch ANN (#1225)
* Simplify ODES init * Add arguments to ES init and create script * Rename similarity_fn_name and add util fn * Create OpenSearchDocumentStore * Specify params of Open Search HNSW * Add better argument handling * Update opensearch index mapping * Edit opensearch default port * Fix HNSW mapping * Force small HNSW params * Implement auto start and stopping of document store services * Fix starting and stopping of ds service * Restore HNSW params * Add opensearch query benchmarks * Add write wait time * Revert wait time * Add timeout * Update benchmarks * Update benchmarks * Update benchmarks json * Update documentation * Update documentation * Fix similarity name * Improve argument passing * Improve stopping and starting of service
This commit is contained in:
parent
4c2a0b914a
commit
363be65a78
@ -159,6 +159,46 @@
|
||||
"model": "BM25 / Elasticsearch",
|
||||
"n_docs": 1000,
|
||||
"map": 74.20444712972909
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 1000,
|
||||
"map": 92.95105322830891
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 10000,
|
||||
"map": 89.8709701490436
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 100000,
|
||||
"map": 86.54014997282701
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 1000,
|
||||
"map": 92.76308330349686
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 10000,
|
||||
"map": 89.00403653862938
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 100000,
|
||||
"map": 85.7342431384476
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 500000,
|
||||
"map": 80.85588135082547
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 500000,
|
||||
"map": 77.5426462347698
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -69,6 +69,20 @@
|
||||
"index_speed": 115.61076852516383,
|
||||
"query_speed": 38.80526238789059,
|
||||
"map": 81.63864883662649
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 100000,
|
||||
"index_speed": 70.05381128388427,
|
||||
"query_speed": 15.306895223372484,
|
||||
"map": 86.54014997282701
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 100000,
|
||||
"index_speed": 70.31004397719536,
|
||||
"query_speed": 24.95733865947408,
|
||||
"map": 85.7342431384476
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -159,6 +159,46 @@
|
||||
"model": "BM25 / Elasticsearch",
|
||||
"n_docs": 1000,
|
||||
"query_speed": 282.95914917837337
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 1000,
|
||||
"query_speed": 29.061163356184426
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 10000,
|
||||
"query_speed": 24.834414667596725
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 100000,
|
||||
"query_speed": 15.306895223372484
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 1000,
|
||||
"query_speed": 29.10621389658101
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 10000,
|
||||
"query_speed": 26.92417300437131
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 100000,
|
||||
"query_speed": 24.95733865947408
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (flat)",
|
||||
"n_docs": 500000,
|
||||
"query_speed": 11.33271222977541
|
||||
},
|
||||
{
|
||||
"model": "DPR / OpenSearch (HNSW)",
|
||||
"n_docs": 500000,
|
||||
"query_speed": 24.13921492357397
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -47,9 +47,9 @@ from haystack.document_store import ElasticsearchDocumentStore
|
||||
document_store = ElasticsearchDocumentStore()
|
||||
```
|
||||
|
||||
Note that we also support [Open Distro for Elasticsearch](https://opendistro.github.io/for-elasticsearch-docs/).
|
||||
Follow [their documentation](https://opendistro.github.io/for-elasticsearch-docs/docs/install/)
|
||||
to run it and connect to it using Haystack's `OpenDistroElasticsearchDocumentStore` class.
|
||||
Note that we also support [OpenSearch](https://opensearch.org/).
|
||||
Follow [their documentation](https://opensearch.org/docs/)
|
||||
to run it and connect to it using Haystack's `OpenSearchDocumentStore` class.
|
||||
|
||||
We further support [AWS Elastic Search Service](https://aws.amazon.com/elasticsearch-service/) with [signed Requests](https://docs.aws.amazon.com/general/latest/gr/signature-version-4.html):
|
||||
Use e.g. [aws-requests-auth](https://github.com/davidmuller/aws-requests-auth) to create an auth object and pass it as `aws4auth` to the `ElasticsearchDocumentStore` constructor.
|
||||
@ -143,6 +143,32 @@ document_store = WeaviateDocumentStore()
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tab">
|
||||
<input type="radio" id="tab-1-1" name="tab-group-1" checked>
|
||||
<label class="labelouter" for="tab-1-1">OpenSearch</label>
|
||||
<div class="tabcontent">
|
||||
|
||||
See the official [OpenSearch documentation](https://opensearch.org/docs/opensearch/install/docker/) on how to install and start an instance.
|
||||
|
||||
If you have Docker set up, we recommend pulling the Docker image and running it.
|
||||
```bash
|
||||
docker pull opensearchproject/opensearch:1.0.0
|
||||
docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.0.0
|
||||
```
|
||||
|
||||
Note that we also have a utility function `haystack.utils.launch_opensearch` that can start up an OpenSearch instance.
|
||||
|
||||
Next you can initialize the Haystack object that will connect to this instance.
|
||||
|
||||
```python
|
||||
from haystack.document_store import OpenSearchDocumentStore
|
||||
|
||||
document_store = OpenSearchDocumentStore()
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes.
|
||||
@ -303,6 +329,21 @@ The Document Stores have different characteristics. You should choose one depend
|
||||
- Fewer options for ANN algorithms than FAISS or Milvus
|
||||
- No BM25 / Tf-idf retrieval
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tab">
|
||||
<input type="radio" id="tab-2-6" name="tab-group-2">
|
||||
<label class="labelouter" for="tab-2-6">OpenSearch</label>
|
||||
<div class="tabcontent">
|
||||
|
||||
**Pros:**
|
||||
- Fully open source fork of Elasticsearch
|
||||
- Has support for Approximate Nearest Neighbours vector search
|
||||
|
||||
**Cons:**
|
||||
- Its ANN algorithms seem a little less performant than FAISS or Milvus in our benchmarks
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore, OpenDistroElasticsearchDocumentStore
|
||||
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore, OpenDistroElasticsearchDocumentStore, OpenSearchDocumentStore
|
||||
from haystack.document_store.faiss import FAISSDocumentStore
|
||||
from haystack.document_store.memory import InMemoryDocumentStore
|
||||
from haystack.document_store.milvus import MilvusDocumentStore
|
||||
|
||||
@ -48,6 +48,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
timeout=30,
|
||||
return_embedding: bool = False,
|
||||
duplicate_documents: str = 'overwrite',
|
||||
index_type: str = "flat"
|
||||
):
|
||||
"""
|
||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||
@ -95,6 +96,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
overwrite: Update any existing documents with the same ID when adding documents.
|
||||
fail: an error is raised if the document ID of the document being added already
|
||||
exists.
|
||||
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
||||
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
||||
|
||||
"""
|
||||
# save init parameters to enable export of component config as YAML
|
||||
@ -105,7 +108,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
custom_mapping=custom_mapping, excluded_meta_data=excluded_meta_data, analyzer=analyzer, scheme=scheme,
|
||||
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
|
||||
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
|
||||
timeout=timeout, return_embedding=return_embedding,
|
||||
timeout=timeout, return_embedding=return_embedding, index_type=index_type
|
||||
)
|
||||
|
||||
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
|
||||
@ -131,10 +134,17 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.custom_mapping = custom_mapping
|
||||
self.index: str = index
|
||||
self.label_index: str = label_index
|
||||
if similarity in ["cosine", "dot_product"]:
|
||||
if similarity in ["cosine", "dot_product", "l2"]:
|
||||
self.similarity = similarity
|
||||
else:
|
||||
raise Exception("Invalid value for similarity in ElasticSearchDocumentStore constructor. Choose between 'cosine' and 'dot_product'")
|
||||
raise Exception(f"Invalid value {similarity} for similarity in ElasticSearchDocumentStore constructor. Choose between 'cosine', 'l2' and 'dot_product'")
|
||||
if index_type in ["flat", "hnsw"]:
|
||||
self.index_type = index_type
|
||||
else:
|
||||
raise Exception("Invalid value for index_type in constructor. Choose between 'flat' and 'hnsw'")
|
||||
if index_type == "hnsw" and type(self) == ElasticsearchDocumentStore:
|
||||
raise Exception("The HNSW algorithm for approximate nearest neighbours calculation is currently not available in the ElasticSearchDocumentStore. "
|
||||
"Try the OpenSearchDocumentStore instead.")
|
||||
if create_index:
|
||||
self._create_document_index(index)
|
||||
self._create_label_index(label_index)
|
||||
@ -142,6 +152,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.duplicate_documents = duplicate_documents
|
||||
self.refresh_type = refresh_type
|
||||
|
||||
|
||||
def _init_elastic_client(self,
|
||||
host: Union[str, List[str]],
|
||||
port: Union[int, List[int]],
|
||||
@ -356,7 +367,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
return buckets
|
||||
|
||||
def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None,
|
||||
batch_size: int = 10_000,duplicate_documents: Optional[str] = None):
|
||||
batch_size: int = 10_000, duplicate_documents: Optional[str] = None):
|
||||
"""
|
||||
Indexes documents for later queries in Elasticsearch.
|
||||
|
||||
@ -805,7 +816,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
if self.similarity == "cosine":
|
||||
similarity_fn_name = "cosineSimilarity"
|
||||
elif self.similarity == "dot_product":
|
||||
similarity_fn_name = "dotProduct"
|
||||
if type(self) == OpenSearchDocumentStore:
|
||||
similarity_fn_name = "innerproduct"
|
||||
elif type(self) == ElasticsearchDocumentStore:
|
||||
similarity_fn_name = "dotProduct"
|
||||
else:
|
||||
raise Exception("Invalid value for similarity in ElasticSearchDocumentStore constructor. Choose between \'cosine\' and \'dot_product\'")
|
||||
|
||||
@ -997,14 +1011,34 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
if self.refresh_type == "wait_for":
|
||||
time.sleep(2)
|
||||
|
||||
class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
|
||||
|
||||
class OpenSearchDocumentStore(ElasticsearchDocumentStore):
|
||||
"""
|
||||
Document Store using the Open Distro for Elasticsearch. It is compatible with the AWS Elasticsearch Service.
|
||||
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
|
||||
|
||||
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
|
||||
the KNN plugin that can scale to a large number of documents.
|
||||
"""
|
||||
|
||||
def __init__(self, verify_certs=False, scheme="https", username="admin", password="admin", port=9201, **kwargs):
    """
    Initialize the document store with defaults that differ from the parent class.

    The default values of the parent class are overwritten here so that, in default cases,
    an OpenSearchDocumentStore can be initialized without providing any arguments.

    :param verify_certs: Passed on to the parent constructor (default: False).
    :param scheme: Passed on to the parent constructor (default: "https").
    :param username: Passed on to the parent constructor (default: "admin").
    :param password: Passed on to the parent constructor (default: "admin").
    :param port: Passed on to the parent constructor (default: 9201).
    :param kwargs: Any further keyword arguments, forwarded unchanged to the parent constructor.
    """
    super().__init__(
        verify_certs=verify_certs,
        scheme=scheme,
        username=username,
        password=password,
        port=port,
        **kwargs
    )
|
||||
|
||||
|
||||
def _create_document_index(self, index_name: str):
|
||||
"""
|
||||
Create a new index for storing documents.
|
||||
@ -1038,21 +1072,40 @@ class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
|
||||
}
|
||||
}
|
||||
if self.embedding_field:
|
||||
|
||||
if self.similarity == "cosine":
|
||||
similarity_space_type = "cosinesimil"
|
||||
elif self.similarity == "dot_product":
|
||||
similarity_space_type = "innerproduct"
|
||||
elif self.similarity == "l2":
|
||||
similarity_space_type = "l2"
|
||||
else:
|
||||
raise Exception(
|
||||
f"Similarity function {self.similarity} is not supported by OpenDistroElasticsearchDocumentStore."
|
||||
)
|
||||
mapping["settings"]["knn"] = True
|
||||
mapping["settings"]["knn.space_type"] = similarity_space_type
|
||||
|
||||
mapping["settings"]["index"] = {}
|
||||
mapping["settings"]["index"]["knn"] = True
|
||||
mapping["settings"]["index"]["knn.space_type"] = similarity_space_type
|
||||
|
||||
mapping["mappings"]["properties"][self.embedding_field] = {
|
||||
"type": "knn_vector",
|
||||
"dimension": self.embedding_dim,
|
||||
}
|
||||
|
||||
if self.index_type == "flat":
|
||||
pass
|
||||
elif self.index_type == "hnsw":
|
||||
mapping["settings"]["index"]["knn.algo_param"] = {}
|
||||
mapping["settings"]["index"]["knn.algo_param"]["ef_search"] = 20
|
||||
mapping["mappings"]["properties"][self.embedding_field]["method"] = {
|
||||
"space_type": similarity_space_type,
|
||||
"name": "hnsw",
|
||||
"engine": "nmslib",
|
||||
"parameters": {
|
||||
"ef_construction": 80,
|
||||
"m": 64
|
||||
}
|
||||
}
|
||||
else:
|
||||
logger.error("Please set index_type to either 'flat' or 'hnsw'")
|
||||
|
||||
try:
|
||||
self.client.indices.create(index=index_name, body=mapping)
|
||||
except RequestError as e:
|
||||
@ -1063,6 +1116,7 @@ class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
|
||||
if not self.client.indices.exists(index=index_name):
|
||||
raise e
|
||||
|
||||
|
||||
def _get_vector_similarity_query(self, query_emb: np.ndarray, top_k: int):
|
||||
"""
|
||||
Generate Elasticsearch query for vector similarity.
|
||||
@ -1072,3 +1126,10 @@ class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore):
|
||||
|
||||
def _scale_embedding_score(self, score):
|
||||
return score
|
||||
|
||||
|
||||
class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore):
    """
    Document store for Open Distro for Elasticsearch, kept for backward compatibility.

    Creating an instance logs a warning recommending the OpenSearchDocumentStore and then
    initializes the store exactly like its parent class.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: Keyword arguments forwarded unchanged to the OpenSearchDocumentStore
                       constructor (e.g. host, port, username, password, index_type).
                       NOTE(review): the parent defaults to port 9201 while `launch_open_distro_es`
                       publishes port 9200 - pass `port` explicitly if needed.
        """
        logger.warning("Open Distro for Elasticsearch has been replaced by OpenSearch! "
                       "See https://opensearch.org/faq/ for details. "
                       "We recommend using the OpenSearchDocumentStore instead.")
        # Without this call no client, index names or similarity settings are set up and the
        # resulting object cannot be used for anything.
        super().__init__(**kwargs)
|
||||
@ -118,7 +118,7 @@ class MilvusDocumentStore(SQLDocumentStore):
|
||||
self.similarity = similarity
|
||||
else:
|
||||
raise ValueError("The Milvus document store can currently only support dot_product and L2 similarity. "
|
||||
"Please set similarity=\"dot_product\"")
|
||||
"Please set similarity=\"dot_product\" or \"l2\"")
|
||||
|
||||
self.index_type = index_type
|
||||
self.index_param = index_param or {"nlist": 16384}
|
||||
|
||||
@ -14,10 +14,8 @@ import torch
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def launch_es():
|
||||
# Start an Elasticsearch server
|
||||
# You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in
|
||||
# your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source.
|
||||
def launch_es(sleep=15):
|
||||
# Start an Elasticsearch server via Docker
|
||||
|
||||
logger.info("Starting Elasticsearch ...")
|
||||
status = subprocess.run(
|
||||
@ -27,13 +25,59 @@ def launch_es():
|
||||
logger.warning("Tried to start Elasticsearch through Docker but this failed. "
|
||||
"It is likely that there is already an existing Elasticsearch instance running. ")
|
||||
else:
|
||||
time.sleep(15)
|
||||
time.sleep(sleep)
|
||||
|
||||
def launch_open_distro_es(sleep=15):
    """
    Start an Open Distro for Elasticsearch server via Docker.

    :param sleep: Seconds to sleep after the `docker run` command returned successfully.
    """
    logger.info("Starting Open Distro for Elasticsearch ...")
    docker_cmd = ['docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" amazon/opendistro-for-elasticsearch:1.13.2']
    result = subprocess.run(docker_cmd, shell=True)
    if result.returncode == 0:
        # The container was started in detached mode; pause before handing control back.
        time.sleep(sleep)
        return
    logger.warning("Tried to start Open Distro for Elasticsearch through Docker but this failed. "
                   "It is likely that there is already an existing Elasticsearch instance running. ")
|
||||
|
||||
def launch_opensearch(sleep=15):
    """
    Start an OpenSearch server via Docker in a container named `opensearch`.

    The container's port 9200 is published on host port 9201.

    :param sleep: Seconds to sleep after the `docker run` command returned successfully.
    """
    logger.info("Starting OpenSearch...")
    # It is not possible to start a new docker container with the name opensearch if there is a
    # stopped container with the same name, so try to remove it first.
    # docker rm only succeeds if the container is stopped, not if it is running.
    subprocess.run(['docker rm opensearch'], shell=True, stdout=subprocess.DEVNULL)
    run_cmd = ['docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name opensearch opensearchproject/opensearch:1.0.0-rc1']
    result = subprocess.run(run_cmd, shell=True)
    if result.returncode == 0:
        # The container was started in detached mode; pause before handing control back.
        time.sleep(sleep)
        return
    logger.warning("Tried to start OpenSearch through Docker but this failed. "
                   "It is likely that there is already an existing OpenSearch instance running. ")
|
||||
|
||||
|
||||
def launch_milvus():
|
||||
# Start a Milvus server
|
||||
# You can start Milvus on your local machine instance using Docker. If Docker is not readily available in
|
||||
# your environment (eg., in Colab notebooks)
|
||||
def stop_opensearch():
    """
    Stop and remove the Docker container named `opensearch`.

    A warning is logged if `docker stop` reports a failure. The `docker rm` command is
    issued afterwards in either case and its result is not inspected.
    """
    logger.info("Stopping OpenSearch...")
    stop_result = subprocess.run(['docker stop opensearch'], shell=True)
    if stop_result.returncode != 0:
        logger.warning("Tried to stop OpenSearch but this failed. "
                       "It is likely that there was no OpenSearch Docker container with the name opensearch")
    # Remove the container so that its name can be reused by a later launch.
    subprocess.run(['docker rm opensearch'], shell=True)
|
||||
|
||||
|
||||
def stop_service(document_store):
    """
    Stop the service behind a document store, where auto stopping is supported.

    Only document stores whose type string contains "OpenSearchDocumentStore" are handled
    (by calling `stop_opensearch`); for every other type a warning is logged.

    :param document_store: The document store instance whose backing service should be stopped.
    """
    ds_class = str(type(document_store))
    if "OpenSearchDocumentStore" not in ds_class:
        logger.warning(f"No support yet for auto stopping the service behind a {ds_class}")
        return
    stop_opensearch()
|
||||
|
||||
|
||||
def launch_milvus(sleep=15):
|
||||
# Start a Milvus server via docker
|
||||
|
||||
logger.info("Starting Milvus ...")
|
||||
logger.warning("Automatic Milvus config creation not yet implemented. "
|
||||
"If you are starting Milvus using launch_milvus(), "
|
||||
@ -55,7 +99,7 @@ def launch_milvus():
|
||||
logger.warning("Tried to start Milvus through Docker but this failed. "
|
||||
"It is likely that there is already an existing Milvus instance running. ")
|
||||
else:
|
||||
time.sleep(15)
|
||||
time.sleep(sleep)
|
||||
|
||||
|
||||
def print_answers(results: dict, details: str = "all"):
|
||||
@ -195,6 +239,7 @@ def get_batches_from_generator(iterable, n):
|
||||
"""
|
||||
Batch elements of an iterable into fixed-length chunks or blocks.
|
||||
"""
|
||||
# TODO consider moving to base.DocumentStore
|
||||
it = iter(iterable)
|
||||
x = tuple(islice(it, n))
|
||||
while x:
|
||||
|
||||
@ -6,6 +6,18 @@
|
||||
"elastic",
|
||||
"elasticsearch"
|
||||
],
|
||||
[
|
||||
"elastic",
|
||||
"opensearch_flat"
|
||||
],
|
||||
[
|
||||
"dpr",
|
||||
"opensearch_flat"
|
||||
],
|
||||
[
|
||||
"dpr",
|
||||
"opensearch_hnsw"
|
||||
],
|
||||
[
|
||||
"dpr",
|
||||
"elasticsearch"
|
||||
|
||||
@ -47,7 +47,9 @@ def retriever(index_csv="retriever_index_results.csv", query_csv="retriever_quer
|
||||
"faiss_hnsw": "FAISS (HNSW)",
|
||||
"milvus_flat": "Milvus (flat)",
|
||||
"milvus_hnsw": "Milvus (HNSW)",
|
||||
"sentence_transformers": "Sentence Transformers"
|
||||
"sentence_transformers": "Sentence Transformers",
|
||||
"opensearch_flat": "OpenSearch (flat)",
|
||||
"opensearch_hnsw": "OpenSearch (HNSW)"
|
||||
}
|
||||
|
||||
index = pd.read_csv(index_csv)
|
||||
|
||||
@ -19,6 +19,7 @@ from farm.file_utils import http_get
|
||||
import json
|
||||
from results_to_json import retriever as retriever_json
|
||||
from templates import RETRIEVER_TEMPLATE, RETRIEVER_MAP_TEMPLATE, RETRIEVER_SPEED_TEMPLATE
|
||||
from haystack.utils import stop_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.getLogger("haystack.retriever.base").setLevel(logging.WARN)
|
||||
@ -86,6 +87,7 @@ def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_
|
||||
with open(md_file, "w") as f:
|
||||
f.write(str(retriever_df.to_markdown()))
|
||||
time.sleep(10)
|
||||
stop_service(doc_store)
|
||||
del doc_store
|
||||
del retriever
|
||||
|
||||
@ -108,6 +110,7 @@ def benchmark_indexing(n_docs_options, retriever_doc_stores, data_dir, filename_
|
||||
doc_store.delete_all_documents(index=doc_index)
|
||||
doc_store.delete_all_documents(index=label_index)
|
||||
time.sleep(10)
|
||||
stop_service(doc_store)
|
||||
del doc_store
|
||||
del retriever
|
||||
if update_json:
|
||||
@ -126,6 +129,7 @@ def benchmark_querying(n_docs_options,
|
||||
embeddings_dir,
|
||||
update_json,
|
||||
save_markdown,
|
||||
wait_write_limit=100,
|
||||
**kwargs):
|
||||
""" Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
|
||||
retriever_results = []
|
||||
@ -153,6 +157,7 @@ def benchmark_querying(n_docs_options,
|
||||
add_precomputed=add_precomputed)
|
||||
logger.info("Start indexing...")
|
||||
index_to_doc_store(doc_store, docs, retriever, labels)
|
||||
|
||||
logger.info("Start queries...")
|
||||
|
||||
raw_results = retriever.eval()
|
||||
@ -178,6 +183,7 @@ def benchmark_querying(n_docs_options,
|
||||
doc_store.delete_all_documents(index=doc_index)
|
||||
doc_store.delete_all_documents(index=label_index)
|
||||
time.sleep(5)
|
||||
stop_service(doc_store)
|
||||
del doc_store
|
||||
del retriever
|
||||
except Exception:
|
||||
|
||||
@ -26,4 +26,7 @@
|
||||
0,sentence_transformers,elasticsearch,1000,10.380210993000219,96.33715544648746,2021-06-02 08:49:29.922794,
|
||||
1,sentence_transformers,elasticsearch,10000,82.89545158599958,120.63388049253265,2021-06-02 08:51:09.796056,
|
||||
2,sentence_transformers,elasticsearch,100000,836.6144149759998,119.52937722555106,2021-06-02 09:05:26.454063,
|
||||
3,sentence_transformers,elasticsearch,500000,4207.770141414,118.82778364694073,2021-06-02 10:16:20.514575,
|
||||
3,sentence_transformers,elasticsearch,500000,4207.770141414,118.82778364694073,2021-06-02 10:16:20.514575,
|
||||
1,dpr,opensearch_flat,100000,1427.47408267,70.05381128388427,2021-07-22 12:33:02.890691,
|
||||
0,elastic,opensearch_flat,100000,207.3902409509992,482.18276588833,2021-07-22 12:08:18.041527,
|
||||
2,dpr,opensearch_hnsw,100000,1422.2719023249992,70.31004397719536,2021-07-22 12:57:54.770107,
|
||||
|
||||
|
@ -27,3 +27,11 @@
|
||||
3,dpr,milvus_hnsw,100000,5637,145.2638032350751,38.80526238789059,0.0257697007690394,90.01241795281177,81.63864883662649,10,2021-06-01 12:07:43.734963,
|
||||
10,dpr,milvus_hnsw,500000,5637,151.7069119141779,37.15717318924075,0.02691270390530032,84.1759801312755,73.57986207906387,10,2021-06-01 16:30:55.573386,
|
||||
0,elastic,elasticsearch,1000,1064,3.760260104999361,282.95914917837337,0.003534079046052031,89.09774436090225,74.2044471297291,10,2021-06-02 08:27:07.187435,
|
||||
0,dpr,opensearch_flat,1000,1064,36.61243656901206,29.061163356184426,0.034410184745312086,99.15413533834587,92.9510532283089,10,2021-07-19 15:25:29.480947,
|
||||
2,dpr,opensearch_flat,10000,5637,226.98340490202918,24.834414667596725,0.040266703016148514,97.49866950505589,89.87097014904359,10,2021-07-19 15:32:29.913507,
|
||||
4,dpr,opensearch_flat,100000,5637,368.2654070430117,15.306895223372486,0.06533003495529745,95.77789604399504,86.54014997282702,10,2021-07-19 15:49:18.090106,
|
||||
1,dpr,opensearch_hnsw,1000,1064,36.55576791198473,29.106213896581007,0.03435692472930896,98.96616541353383,92.76308330349686,10,2021-07-19 15:27:04.463343,
|
||||
3,dpr,opensearch_hnsw,10000,5637,209.3657621010234,26.924173004371312,0.03714134505961032,96.41653361717225,89.00403653862938,10,2021-07-19 15:37:40.060081,
|
||||
5,dpr,opensearch_hnsw,100000,5637,225.86542887897758,24.95733865947408,0.040068374823306295,94.8199396842292,85.7342431384476,10,2021-07-19 15:58:37.854825,
|
||||
0,dpr,opensearch_flat,500000,5637,497.4096126070708,11.33271222977541,0.0882401299640005,93.06368635799184,80.85588135082547,10,2021-07-21 13:12:13.891855,
|
||||
1,dpr,opensearch_hnsw,500000,5637,233.5204362630284,24.139214923573974,0.04142636797286294,88.96576193010468,77.54264623476982,10,2021-07-21 13:39:28.533054,
|
||||
|
@ -1,13 +1,14 @@
|
||||
import os
|
||||
from haystack.document_store.sql import SQLDocumentStore
|
||||
from haystack.document_store.memory import InMemoryDocumentStore
|
||||
from haystack.document_store.elasticsearch import Elasticsearch, ElasticsearchDocumentStore
|
||||
from haystack.document_store.elasticsearch import Elasticsearch, ElasticsearchDocumentStore, OpenSearchDocumentStore
|
||||
from haystack.document_store.faiss import FAISSDocumentStore
|
||||
from haystack.document_store.milvus import MilvusDocumentStore, IndexType
|
||||
from haystack.retriever.sparse import ElasticsearchRetriever, TfidfRetriever
|
||||
from haystack.retriever.dense import DensePassageRetriever, EmbeddingRetriever
|
||||
from haystack.reader.farm import FARMReader
|
||||
from haystack.reader.transformers import TransformersReader
|
||||
from haystack.utils import launch_milvus, launch_es, launch_opensearch
|
||||
from farm.file_utils import http_get
|
||||
|
||||
import logging
|
||||
@ -36,11 +37,13 @@ def get_document_store(document_store_type, similarity='dot_product', index="doc
|
||||
elif document_store_type == "memory":
|
||||
document_store = InMemoryDocumentStore()
|
||||
elif document_store_type == "elasticsearch":
|
||||
launch_es()
|
||||
# make sure we start from a fresh index
|
||||
client = Elasticsearch()
|
||||
client.indices.delete(index='haystack_test*', ignore=[404])
|
||||
document_store = ElasticsearchDocumentStore(index="eval_document", similarity=similarity, timeout=3000)
|
||||
elif document_store_type in ("milvus_flat", "milvus_hnsw"):
|
||||
launch_milvus()
|
||||
if document_store_type == "milvus_flat":
|
||||
index_type = IndexType.FLAT
|
||||
index_param = None
|
||||
@ -57,7 +60,7 @@ def get_document_store(document_store_type, similarity='dot_product', index="doc
|
||||
index=index
|
||||
)
|
||||
assert document_store.get_document_count(index="eval_document") == 0
|
||||
elif document_store_type in("faiss_flat", "faiss_hnsw"):
|
||||
elif document_store_type in ("faiss_flat", "faiss_hnsw"):
|
||||
if document_store_type == "faiss_flat":
|
||||
index_type = "Flat"
|
||||
elif document_store_type == "faiss_hnsw":
|
||||
@ -80,7 +83,13 @@ def get_document_store(document_store_type, similarity='dot_product', index="doc
|
||||
index=index
|
||||
)
|
||||
assert document_store.get_document_count() == 0
|
||||
|
||||
elif document_store_type in ("opensearch_flat", "opensearch_hnsw"):
|
||||
launch_opensearch()
|
||||
if document_store_type == "opensearch_flat":
|
||||
index_type = "flat"
|
||||
elif document_store_type == "opensearch_hnsw":
|
||||
index_type = "hnsw"
|
||||
document_store = OpenSearchDocumentStore(index_type=index_type, timeout=3000)
|
||||
else:
|
||||
raise Exception(f"No document store fixture for '{document_store_type}'")
|
||||
return document_store
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user