mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-03 03:09:28 +00:00
feat: Add IVF and Product Quantization support for OpenSearchDocumentStore (#3850)
* Add IVF and Product Quantization support for OpenSearchDocumentStore * Remove unused import statement * Fix mypy * Adapt doc strings and error messages to account for PQ * Adapt validation of indices * Adapt existing tests * Fix pylint * Add tests * Update lg * Adapt based on PR review comments * Fix Pylint * Adapt based on PR review * Add request_timeout * Adapt based on PR review * Adapt based on PR review * Adapt tests * Pin tenacity * Unpin tenacity * Adapt based on PR comments * Add match to tests --------- Co-authored-by: agnieszka-m <amarzec13@gmail.com>
This commit is contained in:
parent
8370715e7c
commit
7eeb3e07bf
@ -51,7 +51,6 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
timeout: int = 30,
|
timeout: int = 30,
|
||||||
return_embedding: bool = False,
|
return_embedding: bool = False,
|
||||||
duplicate_documents: str = "overwrite",
|
duplicate_documents: str = "overwrite",
|
||||||
index_type: str = "flat",
|
|
||||||
scroll: str = "1d",
|
scroll: str = "1d",
|
||||||
skip_missing_embeddings: bool = True,
|
skip_missing_embeddings: bool = True,
|
||||||
synonyms: Optional[List] = None,
|
synonyms: Optional[List] = None,
|
||||||
@ -113,8 +112,6 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
overwrite: Update any existing documents with the same ID when adding documents.
|
overwrite: Update any existing documents with the same ID when adding documents.
|
||||||
fail: an error is raised if the document ID of the document being added already
|
fail: an error is raised if the document ID of the document being added already
|
||||||
exists.
|
exists.
|
||||||
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
|
||||||
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
|
||||||
:param scroll: Determines how long the current index is fixed, e.g. during updating all documents with embeddings.
|
:param scroll: Determines how long the current index is fixed, e.g. during updating all documents with embeddings.
|
||||||
Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h"
|
Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h"
|
||||||
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
||||||
@ -132,13 +129,6 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
:param use_system_proxy: Whether to use system proxy.
|
:param use_system_proxy: Whether to use system proxy.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# hnsw is only supported in OpensearchDocumentStore
|
|
||||||
if index_type == "hnsw":
|
|
||||||
raise DocumentStoreError(
|
|
||||||
"The HNSW algorithm for approximate nearest neighbours calculation is currently not available in the ElasticSearchDocumentStore. "
|
|
||||||
"Try the OpenSearchDocumentStore instead."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Base constructor might need the client to be ready, create it first
|
# Base constructor might need the client to be ready, create it first
|
||||||
client = self._init_elastic_client(
|
client = self._init_elastic_client(
|
||||||
host=host,
|
host=host,
|
||||||
@ -173,7 +163,6 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
similarity=similarity,
|
similarity=similarity,
|
||||||
return_embedding=return_embedding,
|
return_embedding=return_embedding,
|
||||||
duplicate_documents=duplicate_documents,
|
duplicate_documents=duplicate_documents,
|
||||||
index_type=index_type,
|
|
||||||
scroll=scroll,
|
scroll=scroll,
|
||||||
skip_missing_embeddings=skip_missing_embeddings,
|
skip_missing_embeddings=skip_missing_embeddings,
|
||||||
synonyms=synonyms,
|
synonyms=synonyms,
|
||||||
|
|||||||
@ -459,7 +459,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
|||||||
|
|
||||||
def train_index(
|
def train_index(
|
||||||
self,
|
self,
|
||||||
documents: Optional[Union[List[dict], List[Document]]],
|
documents: Optional[Union[List[dict], List[Document]]] = None,
|
||||||
embeddings: Optional[np.ndarray] = None,
|
embeddings: Optional[np.ndarray] = None,
|
||||||
index: Optional[str] = None,
|
index: Optional[str] = None,
|
||||||
):
|
):
|
||||||
@ -474,15 +474,20 @@ class FAISSDocumentStore(SQLDocumentStore):
|
|||||||
:return: None
|
:return: None
|
||||||
"""
|
"""
|
||||||
index = index or self.index
|
index = index or self.index
|
||||||
if embeddings and documents:
|
if isinstance(embeddings, np.ndarray) and documents:
|
||||||
raise ValueError("Either pass `documents` or `embeddings`. You passed both.")
|
raise ValueError("Either pass `documents` or `embeddings`. You passed both.")
|
||||||
|
|
||||||
if documents:
|
if documents:
|
||||||
document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
|
document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
|
||||||
doc_embeddings = [doc.embedding for doc in document_objects]
|
doc_embeddings = [doc.embedding for doc in document_objects if doc.embedding is not None]
|
||||||
embeddings_for_train = np.array(doc_embeddings, dtype="float32")
|
embeddings_for_train = np.array(doc_embeddings, dtype="float32")
|
||||||
self.faiss_indexes[index].train(embeddings_for_train)
|
self.faiss_indexes[index].train(embeddings_for_train)
|
||||||
if embeddings:
|
elif isinstance(embeddings, np.ndarray):
|
||||||
self.faiss_indexes[index].train(embeddings)
|
self.faiss_indexes[index].train(embeddings)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"When calling `train_index`, you must provide either Documents or embeddings. Because none of these values was provided, no training will be performed. "
|
||||||
|
)
|
||||||
|
|
||||||
def delete_all_documents(
|
def delete_all_documents(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import logging
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
from tenacity import retry, wait_exponential, retry_if_not_result
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from opensearchpy import OpenSearch, Urllib3HttpConnection, RequestsHttpConnection, NotFoundError, RequestError
|
from opensearchpy import OpenSearch, Urllib3HttpConnection, RequestsHttpConnection, NotFoundError, RequestError
|
||||||
@ -33,6 +34,8 @@ SIMILARITY_SPACE_TYPE_MAPPINGS = {
|
|||||||
|
|
||||||
|
|
||||||
class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
||||||
|
valid_index_types = ["flat", "hnsw", "ivf", "ivf_pq"]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
scheme: str = "https", # Mind this different default param
|
scheme: str = "https", # Mind this different default param
|
||||||
@ -69,6 +72,8 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
synonym_type: str = "synonym",
|
synonym_type: str = "synonym",
|
||||||
use_system_proxy: bool = False,
|
use_system_proxy: bool = False,
|
||||||
knn_engine: str = "nmslib",
|
knn_engine: str = "nmslib",
|
||||||
|
knn_parameters: Optional[Dict] = None,
|
||||||
|
ivf_train_size: Optional[int] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the Amazon OpenSearch Service.
|
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the Amazon OpenSearch Service.
|
||||||
@ -119,11 +124,15 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
overwrite: Update any existing documents with the same ID when adding documents.
|
overwrite: Update any existing documents with the same ID when adding documents.
|
||||||
fail: an error is raised if the document ID of the document being added already
|
fail: an error is raised if the document ID of the document being added already
|
||||||
exists.
|
exists.
|
||||||
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'.
|
:param index_type: The type of index you want to create. Choose from 'flat', 'hnsw', 'ivf', or 'ivf_pq'.
|
||||||
As OpenSearch currently does not support all similarity functions (e.g. dot_product) in exact vector similarity calculations,
|
'ivf_pq' is an IVF index optimized for memory through product quantization.
|
||||||
we don't make use of exact vector similarity when index_type='flat'. Instead we use the same approximate vector similarity calculations like in 'hnsw', but further optimized for accuracy.
|
('ivf' and 'ivf_pq' are only available with 'faiss' as knn_engine.)
|
||||||
Exact vector similarity is only used as fallback when there's a mismatch between certain requested and indexed similarity types.
|
If index_type='flat', we use OpenSearch's default index settings (which is an hnsw index
|
||||||
In these cases however, a warning will be displayed. See similarity param for more information.
|
optimized for accuracy and memory footprint), since OpenSearch does not require a special
|
||||||
|
index for exact vector similarity calculations. Note that OpenSearchDocumentStore will only
|
||||||
|
perform exact vector calculations if the selected knn_engine supports it (currently only
|
||||||
|
knn_engine='score_script'). For the other knn_engines we use hnsw, as this usually achieves
|
||||||
|
the best balance between nearly as good accuracy and latency.
|
||||||
:param scroll: Determines how long the current index is fixed, e.g. during updating all documents with embeddings.
|
:param scroll: Determines how long the current index is fixed, e.g. during updating all documents with embeddings.
|
||||||
Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h"
|
Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h"
|
||||||
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
||||||
@ -140,6 +149,22 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
|
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
|
||||||
:param knn_engine: The engine you want to use for the nearest neighbor search by OpenSearch's KNN plug-in. Possible values: "nmslib", "faiss" or "score_script". Defaults to "nmslib".
|
:param knn_engine: The engine you want to use for the nearest neighbor search by OpenSearch's KNN plug-in. Possible values: "nmslib", "faiss" or "score_script". Defaults to "nmslib".
|
||||||
For more information, see [k-NN Index](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/).
|
For more information, see [k-NN Index](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/).
|
||||||
|
:param knn_parameters: Custom parameters for the KNN engine. Parameter names depend on the index type you use.
|
||||||
|
Configurable parameters for indices of type...
|
||||||
|
- `hnsw`: `"ef_construction"`, `"ef_search"`, `"m"`
|
||||||
|
- `ivf`: `"nlist"`, `"nprobes"`
|
||||||
|
- `ivf_pq`: `"nlist"`, `"nprobes"`, `"m"`, `"code_size"`
|
||||||
|
If you don't specify any parameters, the OpenSearch's default values are used.
|
||||||
|
(With the exception of index_type='hnsw', where we use values other than OpenSearch's
|
||||||
|
default ones to achieve comparability throughout DocumentStores in Haystack.)
|
||||||
|
For more information on configuration of knn indices, see
|
||||||
|
[OpenSearch Documentation](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/#method-definitions).
|
||||||
|
:param ivf_train_size: Number of embeddings to use for training the IVF index. Training starts automatically
|
||||||
|
once the number of indexed embeddings exceeds ivf_train_size. If `None`, the minimum
|
||||||
|
number of embeddings recommended for training by FAISS is used (depends on the desired
|
||||||
|
index type and knn parameters). If `0`, training doesn't happen automatically but needs
|
||||||
|
to be triggered manually via the `train_index` method.
|
||||||
|
Default: `None`
|
||||||
"""
|
"""
|
||||||
# These parameters aren't used by Opensearch at the moment but could be in the future, see
|
# These parameters aren't used by Opensearch at the moment but could be in the future, see
|
||||||
# https://github.com/opensearch-project/security/issues/1504. Let's not deprecate them for
|
# https://github.com/opensearch-project/security/issues/1504. Let's not deprecate them for
|
||||||
@ -178,7 +203,23 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
if knn_engine not in {"nmslib", "faiss", "score_script"}:
|
if knn_engine not in {"nmslib", "faiss", "score_script"}:
|
||||||
raise ValueError(f"knn_engine must be either 'nmslib', 'faiss' or 'score_script' but was {knn_engine}")
|
raise ValueError(f"knn_engine must be either 'nmslib', 'faiss' or 'score_script' but was {knn_engine}")
|
||||||
|
|
||||||
|
if index_type in self.valid_index_types:
|
||||||
|
if index_type in ["ivf", "ivf_pq"] and knn_engine != "faiss":
|
||||||
|
raise DocumentStoreError("Use 'faiss' as knn_engine when using 'ivf' as index_type.")
|
||||||
|
self.index_type = index_type
|
||||||
|
else:
|
||||||
|
raise DocumentStoreError(
|
||||||
|
f"Invalid value for index_type in constructor. Choose one of these values: {self.valid_index_types}."
|
||||||
|
)
|
||||||
|
|
||||||
self.knn_engine = knn_engine
|
self.knn_engine = knn_engine
|
||||||
|
self.knn_parameters = {} if knn_parameters is None else knn_parameters
|
||||||
|
if ivf_train_size is not None:
|
||||||
|
if ivf_train_size <= 0:
|
||||||
|
raise DocumentStoreError("`ivf_train_on_write_size` must be None or a positive integer.")
|
||||||
|
self.ivf_train_size = ivf_train_size
|
||||||
|
elif self.index_type in ["ivf", "ivf_pq"]:
|
||||||
|
self.ivf_train_size = self._recommended_ivf_train_size()
|
||||||
self.space_type = SIMILARITY_SPACE_TYPE_MAPPINGS[knn_engine][similarity]
|
self.space_type = SIMILARITY_SPACE_TYPE_MAPPINGS[knn_engine][similarity]
|
||||||
super().__init__(
|
super().__init__(
|
||||||
client=client,
|
client=client,
|
||||||
@ -198,7 +239,6 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
similarity=similarity,
|
similarity=similarity,
|
||||||
return_embedding=return_embedding,
|
return_embedding=return_embedding,
|
||||||
duplicate_documents=duplicate_documents,
|
duplicate_documents=duplicate_documents,
|
||||||
index_type=index_type,
|
|
||||||
scroll=scroll,
|
scroll=scroll,
|
||||||
skip_missing_embeddings=skip_missing_embeddings,
|
skip_missing_embeddings=skip_missing_embeddings,
|
||||||
synonyms=synonyms,
|
synonyms=synonyms,
|
||||||
@ -315,6 +355,9 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
:raises DuplicateDocumentError: Exception trigger on duplicate document
|
:raises DuplicateDocumentError: Exception trigger on duplicate document
|
||||||
:return: None
|
:return: None
|
||||||
"""
|
"""
|
||||||
|
if index is None:
|
||||||
|
index = self.index
|
||||||
|
|
||||||
if self.knn_engine == "faiss" and self.similarity == "cosine":
|
if self.knn_engine == "faiss" and self.similarity == "cosine":
|
||||||
field_map = self._create_document_field_map()
|
field_map = self._create_document_field_map()
|
||||||
documents = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
|
documents = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
|
||||||
@ -331,6 +374,16 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
headers=headers,
|
headers=headers,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Train IVF index if number of embeddings exceeds ivf_train_size
|
||||||
|
if (
|
||||||
|
self.index_type in ["ivf", "ivf_pq"]
|
||||||
|
and not index.startswith(".")
|
||||||
|
and not self._ivf_model_exists(index=index)
|
||||||
|
):
|
||||||
|
if self.get_embedding_count(index=index, headers=headers) >= self.ivf_train_size:
|
||||||
|
train_docs = self.get_all_documents(index=index, return_embedding=True, headers=headers)
|
||||||
|
self._train_ivf_index(index=index, documents=train_docs, headers=headers)
|
||||||
|
|
||||||
def _embed_documents(self, documents: List[Document], retriever: DenseRetriever) -> np.ndarray:
|
def _embed_documents(self, documents: List[Document], retriever: DenseRetriever) -> np.ndarray:
|
||||||
"""
|
"""
|
||||||
Embed a list of documents using a Retriever.
|
Embed a list of documents using a Retriever.
|
||||||
@ -438,6 +491,9 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
if return_embedding is None:
|
if return_embedding is None:
|
||||||
return_embedding = self.return_embedding
|
return_embedding = self.return_embedding
|
||||||
|
|
||||||
|
if self.index_type in ["ivf", "ivf_pq"] and not self._ivf_model_exists(index=index):
|
||||||
|
self._ivf_index_not_trained_error(index=index, headers=headers)
|
||||||
|
|
||||||
if not self.embedding_field:
|
if not self.embedding_field:
|
||||||
raise DocumentStoreError("Please set a valid `embedding_field` for OpenSearchDocumentStore")
|
raise DocumentStoreError("Please set a valid `embedding_field` for OpenSearchDocumentStore")
|
||||||
body = self._construct_dense_query_body(
|
body = self._construct_dense_query_body(
|
||||||
@ -451,8 +507,117 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
self._convert_es_hit_to_document(hit, adapt_score_for_embedding=True, scale_score=scale_score)
|
self._convert_es_hit_to_document(hit, adapt_score_for_embedding=True, scale_score=scale_score)
|
||||||
for hit in result
|
for hit in result
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if self.index_type == "hnsw":
|
||||||
|
ef_search = self._get_ef_search_value()
|
||||||
|
if top_k > ef_search:
|
||||||
|
logger.warning(
|
||||||
|
"top_k (%i) is greater than ef_search (%i). "
|
||||||
|
"We recommend setting ef_search >= top_k for optimal performance.",
|
||||||
|
top_k,
|
||||||
|
ef_search,
|
||||||
|
)
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
def query_by_embedding_batch(
|
||||||
|
self,
|
||||||
|
query_embs: Union[List[np.ndarray], np.ndarray],
|
||||||
|
filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
|
||||||
|
top_k: int = 10,
|
||||||
|
index: Optional[str] = None,
|
||||||
|
return_embedding: Optional[bool] = None,
|
||||||
|
headers: Optional[Dict[str, str]] = None,
|
||||||
|
scale_score: bool = True,
|
||||||
|
) -> List[List[Document]]:
|
||||||
|
"""
|
||||||
|
Find the documents that are most similar to the provided `query_embs` by using a vector similarity metric.
|
||||||
|
|
||||||
|
:param query_embs: Embeddings of the queries (e.g. gathered from DPR).
|
||||||
|
Can be a list of one-dimensional numpy arrays or a two-dimensional numpy array.
|
||||||
|
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||||
|
conditions.
|
||||||
|
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||||
|
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||||
|
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||||
|
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||||
|
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||||
|
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||||
|
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||||
|
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||||
|
operation.
|
||||||
|
|
||||||
|
__Example__:
|
||||||
|
```python
|
||||||
|
filters = {
|
||||||
|
"$and": {
|
||||||
|
"type": {"$eq": "article"},
|
||||||
|
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||||
|
"rating": {"$gte": 3},
|
||||||
|
"$or": {
|
||||||
|
"genre": {"$in": ["economy", "politics"]},
|
||||||
|
"publisher": {"$eq": "nytimes"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# or simpler using default operators
|
||||||
|
filters = {
|
||||||
|
"type": "article",
|
||||||
|
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||||
|
"rating": {"$gte": 3},
|
||||||
|
"$or": {
|
||||||
|
"genre": ["economy", "politics"],
|
||||||
|
"publisher": "nytimes"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
To use the same logical operator multiple times on the same level, logical operators take
|
||||||
|
optionally a list of dictionaries as value.
|
||||||
|
|
||||||
|
__Example__:
|
||||||
|
```python
|
||||||
|
filters = {
|
||||||
|
"$or": [
|
||||||
|
{
|
||||||
|
"$and": {
|
||||||
|
"Type": "News Paper",
|
||||||
|
"Date": {
|
||||||
|
"$lt": "2019-01-01"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$and": {
|
||||||
|
"Type": "Blog Post",
|
||||||
|
"Date": {
|
||||||
|
"$gte": "2019-01-01"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
:param top_k: How many documents to return
|
||||||
|
:param index: Index name for storing the docs and metadata
|
||||||
|
:param return_embedding: To return document embedding
|
||||||
|
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
|
||||||
|
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
|
||||||
|
:param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
|
||||||
|
If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
|
||||||
|
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
if index is None:
|
||||||
|
index = self.index
|
||||||
|
|
||||||
|
if self.index_type in ["ivf", "ivf_pq"] and not self._ivf_model_exists(index=index):
|
||||||
|
self._ivf_index_not_trained_error(index=index, headers=headers)
|
||||||
|
|
||||||
|
return super().query_by_embedding_batch(
|
||||||
|
query_embs, filters, top_k, index, return_embedding, headers, scale_score
|
||||||
|
)
|
||||||
|
|
||||||
def _construct_dense_query_body(
|
def _construct_dense_query_body(
|
||||||
self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
|
self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10
|
||||||
):
|
):
|
||||||
@ -509,8 +674,11 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
index_definition["settings"]["index"] = {"knn": True} # TODO: option to turn off for script scoring
|
index_definition["settings"]["index"] = {"knn": True} # TODO: option to turn off for script scoring
|
||||||
# global ef_search setting affects only nmslib, for faiss it is set in the field mapping
|
# global ef_search setting affects only nmslib, for faiss it is set in the field mapping
|
||||||
if self.knn_engine == "nmslib" and self.index_type == "hnsw":
|
if self.knn_engine == "nmslib" and self.index_type == "hnsw":
|
||||||
index_definition["settings"]["index"]["knn.algo_param.ef_search"] = 20
|
ef_search = self._get_ef_search_value()
|
||||||
index_definition["mappings"]["properties"][self.embedding_field] = self._get_embedding_field_mapping()
|
index_definition["settings"]["index"]["knn.algo_param.ef_search"] = ef_search
|
||||||
|
index_definition["mappings"]["properties"][self.embedding_field] = self._get_embedding_field_mapping(
|
||||||
|
index=index_name
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.client.indices.create(index=index_name, body=index_definition, headers=headers)
|
self.client.indices.create(index=index_name, body=index_definition, headers=headers)
|
||||||
@ -522,6 +690,68 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
if not self._index_exists(index_name, headers=headers):
|
if not self._index_exists(index_name, headers=headers):
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
def train_index(
|
||||||
|
self,
|
||||||
|
documents: Optional[Union[List[dict], List[Document]]] = None,
|
||||||
|
embeddings: Optional[np.ndarray] = None,
|
||||||
|
index: Optional[str] = None,
|
||||||
|
headers: Optional[Dict[str, str]] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Trains an IVF index on the provided Documents or embeddings if the index hasn't been trained yet.
|
||||||
|
|
||||||
|
The train vectors should come from the same distribution as your final vectors.
|
||||||
|
You can pass either Documents (including embeddings) or just plain embeddings you want to train the index on.
|
||||||
|
|
||||||
|
:param documents: Documents (including the embeddings) you want to train the index on.
|
||||||
|
:param embeddings: Plain embeddings you want to train the index on.
|
||||||
|
:param index: Name of the index to train. If `None`, the DocumentStore's default index (self.index) is used.
|
||||||
|
:param headers: Custom HTTP headers to pass to the OpenSearch client (for example {'Authorization': 'Basic YWRtaW46cm9vdA=='}).
|
||||||
|
For more information, see [HTTP/REST clients and security](https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html).
|
||||||
|
"""
|
||||||
|
if self.index_type not in ["ivf", "ivf_pq"]:
|
||||||
|
raise DocumentStoreError(
|
||||||
|
"You can only train an index if you set `index_type` to 'ivf' or 'ivf_pq' in your DocumentStore. "
|
||||||
|
"Other index types don't require training."
|
||||||
|
)
|
||||||
|
|
||||||
|
if index is None:
|
||||||
|
index = self.index
|
||||||
|
|
||||||
|
if isinstance(embeddings, np.ndarray) and documents:
|
||||||
|
raise ValueError("Pass either `documents` or `embeddings`. You passed both.")
|
||||||
|
|
||||||
|
if documents:
|
||||||
|
document_objects = [Document.from_dict(d) if isinstance(d, dict) else d for d in documents]
|
||||||
|
document_objects = [doc for doc in document_objects if doc.embedding is not None]
|
||||||
|
self._train_ivf_index(index=index, documents=document_objects, headers=headers)
|
||||||
|
elif isinstance(embeddings, np.ndarray):
|
||||||
|
document_objects = [
|
||||||
|
Document(content=f"Embedding {i}", embedding=embedding) for i, embedding in enumerate(embeddings)
|
||||||
|
]
|
||||||
|
self._train_ivf_index(index=index, documents=document_objects, headers=headers)
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
"When calling `train_index`, you must provide either Documents or embeddings. "
|
||||||
|
"Because none of these values was provided, the index won't be trained."
|
||||||
|
)
|
||||||
|
|
||||||
|
def delete_index(self, index: str):
|
||||||
|
"""
|
||||||
|
Delete an existing search index. The index together with all data will be removed.
|
||||||
|
If the index is of type `"ivf"` or `"ivf_pq"`, this method also deletes the corresponding IVF and PQ model.
|
||||||
|
|
||||||
|
:param index: The name of the index to delete.
|
||||||
|
:return: None
|
||||||
|
"""
|
||||||
|
# Check if index uses an IVF model and delete it
|
||||||
|
index_mapping = self.client.indices.get(index)[index]["mappings"]["properties"]
|
||||||
|
if self.embedding_field in index_mapping and "model_id" in index_mapping[self.embedding_field]:
|
||||||
|
model_id = index_mapping[self.embedding_field]["model_id"]
|
||||||
|
self.client.transport.perform_request("DELETE", f"/_plugins/_knn/models/{model_id}")
|
||||||
|
|
||||||
|
super().delete_index(index)
|
||||||
|
|
||||||
def _validate_and_adjust_document_index(self, index_name: str, headers: Optional[Dict[str, str]] = None):
|
def _validate_and_adjust_document_index(self, index_name: str, headers: Optional[Dict[str, str]] = None):
|
||||||
"""
|
"""
|
||||||
Validates an existing document index. If there's no embedding field, we'll add it.
|
Validates an existing document index. If there's no embedding field, we'll add it.
|
||||||
@ -565,7 +795,7 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
|
|
||||||
if existing_embedding_field is None:
|
if existing_embedding_field is None:
|
||||||
# create embedding field
|
# create embedding field
|
||||||
mappings["properties"][self.embedding_field] = self._get_embedding_field_mapping()
|
mappings["properties"][self.embedding_field] = self._get_embedding_field_mapping(index=index_name)
|
||||||
self.client.indices.put_mapping(index=index_id, body=mappings, headers=headers)
|
self.client.indices.put_mapping(index=index_id, body=mappings, headers=headers)
|
||||||
else:
|
else:
|
||||||
# check type of existing embedding field
|
# check type of existing embedding field
|
||||||
@ -579,14 +809,16 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Check if existing embedding field fits desired knn settings
|
# Check if existing embedding field fits desired knn settings
|
||||||
if self.knn_engine != "score_script":
|
training_required = self.index_type in ["ivf", "ivf_pq"] and "model_id" not in existing_embedding_field
|
||||||
|
if self.knn_engine != "score_script" and not training_required:
|
||||||
self._validate_approximate_knn_settings(existing_embedding_field, index_settings, index_id)
|
self._validate_approximate_knn_settings(existing_embedding_field, index_settings, index_id)
|
||||||
|
|
||||||
# Adjust global ef_search setting (nmslib only). If not set, default is 512.
|
# Adjust global ef_search setting (nmslib only).
|
||||||
if self.knn_engine == "nmslib":
|
if self.knn_engine == "nmslib":
|
||||||
ef_search = index_settings.get("knn.algo_param", {}).get("ef_search", 512)
|
ef_search = index_settings.get("knn.algo_param", {}).get("ef_search", 512)
|
||||||
if self.index_type == "hnsw" and ef_search != 20:
|
desired_ef_search = self._get_ef_search_value()
|
||||||
body = {"knn.algo_param.ef_search": 20}
|
if self.index_type == "hnsw" and ef_search != desired_ef_search:
|
||||||
|
body = {"knn.algo_param.ef_search": desired_ef_search}
|
||||||
self.client.indices.put_settings(index=index_id, body=body, headers=headers)
|
self.client.indices.put_settings(index=index_id, body=body, headers=headers)
|
||||||
logger.info("Set ef_search to 20 for hnsw index '%s'.", index_id)
|
logger.info("Set ef_search to 20 for hnsw index '%s'.", index_id)
|
||||||
elif self.index_type == "flat" and ef_search != 512:
|
elif self.index_type == "flat" and ef_search != 512:
|
||||||
@ -603,19 +835,13 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
If settings are not specified we infer the same default values as https://opensearch.org/docs/latest/search-plugins/knn/knn-index/
|
If settings are not specified we infer the same default values as https://opensearch.org/docs/latest/search-plugins/knn/knn-index/
|
||||||
"""
|
"""
|
||||||
method = existing_embedding_field.get("method", {})
|
method = existing_embedding_field.get("method", {})
|
||||||
embedding_field_space_type = method.get("space_type", "l2")
|
if "model_id" in existing_embedding_field:
|
||||||
embedding_field_knn_engine = method.get("engine", "nmslib")
|
embedding_field_knn_engine = "faiss"
|
||||||
embedding_field_method_name = method.get("name", "hnsw")
|
|
||||||
|
|
||||||
parameters = method.get("parameters", {})
|
|
||||||
embedding_field_ef_construction = parameters.get("ef_construction", 512)
|
|
||||||
embedding_field_m = parameters.get("m", 16)
|
|
||||||
# ef_search is configured in the index settings and not in the mapping for nmslib
|
|
||||||
if embedding_field_knn_engine == "nmslib":
|
|
||||||
embedding_field_ef_search = index_settings.get("knn.algo_param", {}).get("ef_search", 512)
|
|
||||||
else:
|
else:
|
||||||
embedding_field_ef_search = parameters.get("ef_search", 512)
|
embedding_field_knn_engine = method.get("engine", "nmslib")
|
||||||
|
embedding_field_space_type = method.get("space_type", "l2")
|
||||||
|
|
||||||
|
# Validate knn engine
|
||||||
if embedding_field_knn_engine != self.knn_engine:
|
if embedding_field_knn_engine != self.knn_engine:
|
||||||
raise DocumentStoreError(
|
raise DocumentStoreError(
|
||||||
f"Existing embedding field '{self.embedding_field}' of OpenSearch index '{index_id}' has knn_engine "
|
f"Existing embedding field '{self.embedding_field}' of OpenSearch index '{index_id}' has knn_engine "
|
||||||
@ -626,6 +852,7 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
f" - Overwrite the existing index by setting `recreate_index=True`. Note that you'll lose all existing data."
|
f" - Overwrite the existing index by setting `recreate_index=True`. Note that you'll lose all existing data."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Validate space type
|
||||||
if embedding_field_space_type != self.space_type:
|
if embedding_field_space_type != self.space_type:
|
||||||
supported_similaries = [
|
supported_similaries = [
|
||||||
k
|
k
|
||||||
@ -646,7 +873,32 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
f" - Overwrite the existing index by setting `recreate_index=True`. Note that you'll lose all existing data."
|
f" - Overwrite the existing index by setting `recreate_index=True`. Note that you'll lose all existing data."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Validate HNSW indices
|
||||||
|
if self.index_type in ["flat", "hnsw"]:
|
||||||
|
self._validate_hnsw_settings(existing_embedding_field, index_settings, index_id)
|
||||||
|
# Validate IVF indices
|
||||||
|
elif self.index_type in ["ivf", "ivf_pq"]:
|
||||||
|
self._validate_ivf_settings(existing_embedding_field, index_settings, index_id)
|
||||||
|
else:
|
||||||
|
raise DocumentStoreError("Unknown index_type. Must be one of 'flat', 'hnsw', 'ivf', or 'ivf_pq'.")
|
||||||
|
|
||||||
|
def _validate_hnsw_settings(
|
||||||
|
self, existing_embedding_field: Dict[str, Any], index_settings: Dict[str, Any], index_id: str
|
||||||
|
):
|
||||||
|
method = existing_embedding_field.get("method", {})
|
||||||
|
parameters = method.get("parameters", {})
|
||||||
|
embedding_field_method_name = method.get("name", "hnsw")
|
||||||
|
embedding_field_ef_construction = parameters.get("ef_construction", 512)
|
||||||
|
embedding_field_m = parameters.get("m", 16)
|
||||||
|
embedding_field_knn_engine = method.get("engine", "nmslib")
|
||||||
|
# ef_search is configured in the index settings and not in the mapping for nmslib
|
||||||
|
if embedding_field_knn_engine == "nmslib":
|
||||||
|
embedding_field_ef_search = index_settings.get("knn.algo_param", {}).get("ef_search", 512)
|
||||||
|
else:
|
||||||
|
embedding_field_ef_search = parameters.get("ef_search", 512)
|
||||||
|
|
||||||
# Check method params according to requested index_type
|
# Check method params according to requested index_type
|
||||||
|
# Indices of type "flat" that don't use "score_script" as knn_engine use an HNSW index optimized for accuracy
|
||||||
if self.index_type == "flat":
|
if self.index_type == "flat":
|
||||||
self._assert_embedding_param(
|
self._assert_embedding_param(
|
||||||
name="method.name", actual=embedding_field_method_name, expected="hnsw", index_id=index_id
|
name="method.name", actual=embedding_field_method_name, expected="hnsw", index_id=index_id
|
||||||
@ -659,17 +911,54 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
self._assert_embedding_param(
|
self._assert_embedding_param(
|
||||||
name="ef_search", actual=embedding_field_ef_search, expected=512, index_id=index_id
|
name="ef_search", actual=embedding_field_ef_search, expected=512, index_id=index_id
|
||||||
)
|
)
|
||||||
if self.index_type == "hnsw":
|
|
||||||
|
elif self.index_type == "hnsw":
|
||||||
|
expected_ef_construction = self.knn_parameters.get("ef_construction", 80)
|
||||||
|
expected_m = self.knn_parameters.get("m", 64)
|
||||||
|
expected_ef_search = self.knn_parameters.get("ef_search", 20)
|
||||||
self._assert_embedding_param(
|
self._assert_embedding_param(
|
||||||
name="method.name", actual=embedding_field_method_name, expected="hnsw", index_id=index_id
|
name="method.name", actual=embedding_field_method_name, expected="hnsw", index_id=index_id
|
||||||
)
|
)
|
||||||
self._assert_embedding_param(
|
self._assert_embedding_param(
|
||||||
name="ef_construction", actual=embedding_field_ef_construction, expected=80, index_id=index_id
|
name="ef_construction",
|
||||||
|
actual=embedding_field_ef_construction,
|
||||||
|
expected=expected_ef_construction,
|
||||||
|
index_id=index_id,
|
||||||
)
|
)
|
||||||
self._assert_embedding_param(name="m", actual=embedding_field_m, expected=64, index_id=index_id)
|
self._assert_embedding_param(name="m", actual=embedding_field_m, expected=expected_m, index_id=index_id)
|
||||||
if self.knn_engine == "faiss":
|
if self.knn_engine == "faiss":
|
||||||
self._assert_embedding_param(
|
self._assert_embedding_param(
|
||||||
name="ef_search", actual=embedding_field_ef_search, expected=20, index_id=index_id
|
name="ef_search", actual=embedding_field_ef_search, expected=expected_ef_search, index_id=index_id
|
||||||
|
)
|
||||||
|
|
||||||
|
def _validate_ivf_settings(
|
||||||
|
self, existing_embedding_field: Dict[str, Any], index_settings: Dict[str, Any], index_id: str
|
||||||
|
):
|
||||||
|
# Index is not trained yet and should therefore be an HNSW index with default settings until index is trained
|
||||||
|
if "model_id" in existing_embedding_field:
|
||||||
|
model_endpoint = f"/_plugins/_knn/models/{existing_embedding_field['model_id']}"
|
||||||
|
response = self.client.transport.perform_request("GET", url=model_endpoint)
|
||||||
|
model_settings_list = [setting.split(":") for setting in response["description"].split()]
|
||||||
|
model_settings = {k: (int(v) if v.isnumeric() else v) for k, v in model_settings_list}
|
||||||
|
|
||||||
|
embedding_field_nlist = model_settings.get("nlist")
|
||||||
|
embedding_field_nprobes = model_settings.get("nprobes")
|
||||||
|
expected_nlist = self.knn_parameters.get("nlist", 4)
|
||||||
|
expected_nprobes = self.knn_parameters.get("nprobes", 1)
|
||||||
|
self._assert_embedding_param(
|
||||||
|
name="nlist", actual=embedding_field_nlist, expected=expected_nlist, index_id=index_id
|
||||||
|
)
|
||||||
|
self._assert_embedding_param(
|
||||||
|
name="nprobes", actual=embedding_field_nprobes, expected=expected_nprobes, index_id=index_id
|
||||||
|
)
|
||||||
|
if self.index_type == "ivf_pq":
|
||||||
|
embedding_field_m = model_settings.get("m")
|
||||||
|
embedding_field_code_size = model_settings.get("code_size")
|
||||||
|
expected_m = self.knn_parameters.get("m", 1)
|
||||||
|
expected_code_size = self.knn_parameters.get("code_size", 8)
|
||||||
|
self._assert_embedding_param(name="m", actual=embedding_field_m, expected=expected_m, index_id=index_id)
|
||||||
|
self._assert_embedding_param(
|
||||||
|
name="code_size", actual=embedding_field_code_size, expected=expected_code_size, index_id=index_id
|
||||||
)
|
)
|
||||||
|
|
||||||
def _assert_embedding_param(self, name: str, actual: Any, expected: Any, index_id: str) -> None:
|
def _assert_embedding_param(self, name: str, actual: Any, expected: Any, index_id: str) -> None:
|
||||||
@ -690,6 +979,7 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
space_type: Optional[str] = None,
|
space_type: Optional[str] = None,
|
||||||
index_type: Optional[str] = None,
|
index_type: Optional[str] = None,
|
||||||
embedding_dim: Optional[int] = None,
|
embedding_dim: Optional[int] = None,
|
||||||
|
index: Optional[str] = None,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
if space_type is None:
|
if space_type is None:
|
||||||
space_type = self.space_type
|
space_type = self.space_type
|
||||||
@ -699,27 +989,78 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
index_type = self.index_type
|
index_type = self.index_type
|
||||||
if embedding_dim is None:
|
if embedding_dim is None:
|
||||||
embedding_dim = self.embedding_dim
|
embedding_dim = self.embedding_dim
|
||||||
|
if index is None:
|
||||||
|
index = self.index
|
||||||
|
|
||||||
embeddings_field_mapping = {"type": "knn_vector", "dimension": embedding_dim}
|
embeddings_field_mapping = {"type": "knn_vector", "dimension": embedding_dim}
|
||||||
if knn_engine != "score_script":
|
if knn_engine != "score_script":
|
||||||
method: dict = {"space_type": space_type, "name": "hnsw", "engine": knn_engine}
|
method: dict = {"space_type": space_type, "engine": knn_engine}
|
||||||
|
|
||||||
|
ef_construction = (
|
||||||
|
80 if "ef_construction" not in self.knn_parameters else self.knn_parameters["ef_construction"]
|
||||||
|
)
|
||||||
|
ef_search = self._get_ef_search_value()
|
||||||
|
m = 64 if "m" not in self.knn_parameters else self.knn_parameters["m"]
|
||||||
|
|
||||||
if index_type == "flat":
|
if index_type == "flat":
|
||||||
|
# We're using HNSW with knn_engines nmslib and faiss as they do not support exact knn.
|
||||||
|
method["name"] = "hnsw"
|
||||||
# use default parameters from https://opensearch.org/docs/1.2/search-plugins/knn/knn-index/
|
# use default parameters from https://opensearch.org/docs/1.2/search-plugins/knn/knn-index/
|
||||||
# we need to set them explicitly as aws managed instances starting from version 1.2 do not support empty parameters
|
# we need to set them explicitly as aws managed instances starting from version 1.2 do not support empty parameters
|
||||||
method["parameters"] = {"ef_construction": 512, "m": 16}
|
method["parameters"] = {"ef_construction": 512, "m": 16}
|
||||||
elif index_type == "hnsw":
|
elif index_type == "hnsw":
|
||||||
method["parameters"] = {"ef_construction": 80, "m": 64}
|
method["name"] = "hnsw"
|
||||||
|
method["parameters"] = {"ef_construction": ef_construction, "m": m}
|
||||||
# for nmslib this is a global index setting
|
# for nmslib this is a global index setting
|
||||||
if knn_engine == "faiss":
|
if knn_engine == "faiss":
|
||||||
method["parameters"]["ef_search"] = 20
|
method["parameters"]["ef_search"] = ef_search
|
||||||
|
elif index_type in ["ivf", "ivf_pq"]:
|
||||||
|
if knn_engine != "faiss":
|
||||||
|
raise DocumentStoreError("To use 'ivf' or 'ivf_pq as index_type, set knn_engine to 'faiss'.")
|
||||||
|
# Check if IVF model already exists
|
||||||
|
if self._ivf_model_exists(index):
|
||||||
|
logger.info("Using existing IVF model '%s-ivf' for index '%s'.", index, index)
|
||||||
|
embeddings_field_mapping = {"type": "knn_vector", "model_id": f"{index}-ivf"}
|
||||||
|
method = {}
|
||||||
|
else:
|
||||||
|
# IVF indices require training before they can be initialized. Setting index_type to HNSW until
|
||||||
|
# index is trained
|
||||||
|
logger.info("Using index of type 'flat' for index '%s' until IVF model is trained.", index)
|
||||||
|
method = {}
|
||||||
else:
|
else:
|
||||||
logger.error("Set index_type to either 'flat' or 'hnsw'")
|
logger.error("Set index_type to either 'flat', 'hnsw', 'ivf', or 'ivf_pq'.")
|
||||||
|
method["name"] = "hnsw"
|
||||||
|
|
||||||
embeddings_field_mapping["method"] = method
|
if method:
|
||||||
|
embeddings_field_mapping["method"] = method
|
||||||
|
|
||||||
return embeddings_field_mapping
|
return embeddings_field_mapping
|
||||||
|
|
||||||
|
def _ivf_model_exists(self, index: str) -> bool:
|
||||||
|
if self._index_exists(".opensearch-knn-models"):
|
||||||
|
response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search")
|
||||||
|
existing_ivf_models = set(
|
||||||
|
model["_source"]["model_id"]
|
||||||
|
for model in response["hits"]["hits"]
|
||||||
|
if model["_source"]["state"] != "failed"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
existing_ivf_models = set()
|
||||||
|
|
||||||
|
return f"{index}-ivf" in existing_ivf_models
|
||||||
|
|
||||||
|
def _ivf_index_not_trained_error(self, index: str, headers: Optional[Dict[str, str]] = None):
|
||||||
|
add_num_of_embs = ""
|
||||||
|
if self.ivf_train_size != 0:
|
||||||
|
embs_to_add = self.get_embedding_count(index=index, headers=headers) - self.ivf_train_size
|
||||||
|
add_num_of_embs = (
|
||||||
|
f"or add at least {embs_to_add} more embeddings to automatically start the " f"training process "
|
||||||
|
)
|
||||||
|
raise DocumentStoreError(
|
||||||
|
f"Index of type '{self.index_type}' is not trained yet. Train the index manually using "
|
||||||
|
f"`train_index` {add_num_of_embs}before querying it."
|
||||||
|
)
|
||||||
|
|
||||||
def _create_label_index(self, index_name: str, headers: Optional[Dict[str, str]] = None):
|
def _create_label_index(self, index_name: str, headers: Optional[Dict[str, str]] = None):
|
||||||
mapping = {
|
mapping = {
|
||||||
"mappings": {
|
"mappings": {
|
||||||
@ -798,6 +1139,156 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore):
|
|||||||
|
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
def _train_ivf_index(
|
||||||
|
self, index: Optional[str], documents: List[Document], headers: Optional[Dict[str, str]] = None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
If the provided index is not an IVF index yet, this method trains it on the provided Documents
|
||||||
|
and converts the index to an IVF index.
|
||||||
|
"""
|
||||||
|
if index is None:
|
||||||
|
index = self.index
|
||||||
|
|
||||||
|
# Check if IVF index is already trained by checking if embedding mapping contains a model_id field
|
||||||
|
if "model_id" in self.client.indices.get(index)[index]["mappings"]["properties"][self.embedding_field]:
|
||||||
|
logger.info("IVF index '%s' is already trained. Skipping training.", index)
|
||||||
|
# IVF model is not trained yet -> train it and convert HNSW index to IVF index
|
||||||
|
else:
|
||||||
|
nlist = self.knn_parameters.get("nlist", 4)
|
||||||
|
nprobes = self.knn_parameters.get("nprobes", 1)
|
||||||
|
recommended_train_size = self._recommended_ivf_train_size()
|
||||||
|
documents = [doc for doc in documents if doc.embedding is not None]
|
||||||
|
if len(documents) < nlist:
|
||||||
|
raise DocumentStoreError(
|
||||||
|
f"IVF training requires the number of training samples to be greater than or "
|
||||||
|
f"equal to `nlist`. Number of provided training samples is `{len(documents)}` "
|
||||||
|
f"and nlist is `{nlist}`."
|
||||||
|
)
|
||||||
|
if len(documents) < recommended_train_size:
|
||||||
|
logger.warning(
|
||||||
|
"Consider increasing the number of training samples to at least "
|
||||||
|
"`%i` to get a reliable %s index.",
|
||||||
|
recommended_train_size,
|
||||||
|
self.index_type,
|
||||||
|
)
|
||||||
|
# Create temporary index containing training embeddings
|
||||||
|
self._create_document_index(index_name=f".{index}_ivf_training", headers=headers)
|
||||||
|
self.write_documents(documents=documents, index=f".{index}_ivf_training", headers=headers)
|
||||||
|
|
||||||
|
settings = f"index_type:{self.index_type} nlist:{nlist} nprobes:{nprobes}"
|
||||||
|
training_req_body: Dict = {
|
||||||
|
"training_index": f".{index}_ivf_training",
|
||||||
|
"training_field": self.embedding_field,
|
||||||
|
"dimension": self.embedding_dim,
|
||||||
|
"method": {
|
||||||
|
"name": "ivf",
|
||||||
|
"engine": "faiss",
|
||||||
|
"space_type": self.space_type,
|
||||||
|
"parameters": {"nlist": nlist, "nprobes": nprobes},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
# Add product quantization
|
||||||
|
if self.index_type == "ivf_pq":
|
||||||
|
m = self.knn_parameters.get("m", 1)
|
||||||
|
code_size = self.knn_parameters.get("code_size", 8)
|
||||||
|
if code_size > 8:
|
||||||
|
raise DocumentStoreError(
|
||||||
|
f"code_size parameter for product quantization must be less than or equal to 8. "
|
||||||
|
f"Provided code_size is `{code_size}`."
|
||||||
|
)
|
||||||
|
|
||||||
|
# see FAISS doc for details: https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
|
||||||
|
n_clusters = 2**code_size
|
||||||
|
if len(documents) < n_clusters:
|
||||||
|
raise DocumentStoreError(
|
||||||
|
f"PQ training requires the number of training samples to be greater than or "
|
||||||
|
f"equal to the number of clusters. Number of provided training samples is `{len(documents)}` "
|
||||||
|
f"and the number of clusters is `{n_clusters}`."
|
||||||
|
)
|
||||||
|
|
||||||
|
encoder = {"name": "pq", "parameters": {"m": m, "code_size": code_size}}
|
||||||
|
settings += f" m:{m} code_size:{code_size}"
|
||||||
|
training_req_body["method"]["parameters"]["encoder"] = encoder
|
||||||
|
|
||||||
|
training_req_body["description"] = settings
|
||||||
|
logger.info("Training IVF index '%s' using {len(documents)} embeddings.", index)
|
||||||
|
train_endpoint = f"/_plugins/_knn/models/{index}-ivf/_train"
|
||||||
|
response = self.client.transport.perform_request(
|
||||||
|
"POST", url=train_endpoint, headers=headers, body=training_req_body
|
||||||
|
)
|
||||||
|
ivf_model = response["model_id"]
|
||||||
|
|
||||||
|
# Wait until model training is finished, _knn_model_trained uses a retry decorator
|
||||||
|
if self._knn_model_trained(ivf_model, headers=headers):
|
||||||
|
logger.info("Training of IVF index '%s' finished.", index)
|
||||||
|
|
||||||
|
# Delete temporary training index
|
||||||
|
self.client.indices.delete(index=f".{index}_ivf_training", headers=headers)
|
||||||
|
|
||||||
|
# Clone original index to temporary one
|
||||||
|
self.client.indices.add_block(index=index, block="read_only")
|
||||||
|
self.client.indices.clone(
|
||||||
|
index=index, target=f".{index}_temp", body={"settings": {"index": {"blocks": {"read_only": False}}}}
|
||||||
|
)
|
||||||
|
self.client.indices.put_settings(index=index, body={"index": {"blocks": {"read_only": False}}})
|
||||||
|
self.client.indices.delete(index=index)
|
||||||
|
|
||||||
|
# Reindex original index to newly created IVF index
|
||||||
|
self._create_document_index(index_name=index, headers=headers)
|
||||||
|
self.client.reindex(
|
||||||
|
body={"source": {"index": f".{index}_temp"}, "dest": {"index": index}},
|
||||||
|
params={"request_timeout": 24 * 60 * 60},
|
||||||
|
)
|
||||||
|
self.client.indices.delete(index=f".{index}_temp")
|
||||||
|
|
||||||
|
def _recommended_ivf_train_size(self) -> int:
|
||||||
|
"""
|
||||||
|
Calculates the minumum recommended number of training samples for IVF training as suggested in FAISS docs.
|
||||||
|
https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids
|
||||||
|
"""
|
||||||
|
min_points_per_cluster = 39
|
||||||
|
if self.index_type == "ivf":
|
||||||
|
n_clusters = self.knn_parameters.get("nlist", 4)
|
||||||
|
return n_clusters * min_points_per_cluster
|
||||||
|
elif self.index_type == "ivf_pq":
|
||||||
|
n_clusters = 2 ** self.knn_parameters.get("code_size", 8)
|
||||||
|
return n_clusters * min_points_per_cluster
|
||||||
|
else:
|
||||||
|
raise DocumentStoreError(f"Invalid index type '{self.index_type}'.")
|
||||||
|
|
||||||
|
@retry(retry=retry_if_not_result(bool), wait=wait_exponential(min=1, max=10))
|
||||||
|
def _knn_model_trained(self, model_name: str, headers: Optional[Dict[str, str]] = None) -> bool:
|
||||||
|
model_state_endpoint = f"/_plugins/_knn/models/{model_name}"
|
||||||
|
response = self.client.transport.perform_request("GET", url=model_state_endpoint, headers=headers)
|
||||||
|
model_state = response["state"]
|
||||||
|
if model_state == "created":
|
||||||
|
return True
|
||||||
|
elif model_state == "failed":
|
||||||
|
error_message = response["error"]
|
||||||
|
raise DocumentStoreError(f"Failed to train the KNN model. Error message: {error_message}")
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _get_ef_search_value(self) -> int:
|
||||||
|
ef_search = 20 if "ef_search" not in self.knn_parameters else self.knn_parameters["ef_search"]
|
||||||
|
return ef_search
|
||||||
|
|
||||||
|
def _delete_index(self, index: str):
|
||||||
|
if self._index_exists(index):
|
||||||
|
self.client.indices.delete(index=index, ignore=[400, 404])
|
||||||
|
self._delete_ivf_model(index)
|
||||||
|
logger.info("Index '%s' deleted.", index)
|
||||||
|
|
||||||
|
def _delete_ivf_model(self, index: str):
|
||||||
|
"""
|
||||||
|
If index is an index of type 'ivf' or 'ivf_pq', this method deletes the corresponding IVF model.
|
||||||
|
"""
|
||||||
|
if self._index_exists(".opensearch-knn-models"):
|
||||||
|
response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search")
|
||||||
|
existing_ivf_models = set(model["_source"]["model_id"] for model in response["hits"]["hits"])
|
||||||
|
if f"{index}-ivf" in existing_ivf_models:
|
||||||
|
self.client.transport.perform_request("DELETE", f"/_plugins/_knn/models/{index}-ivf")
|
||||||
|
|
||||||
def clone_embedding_field(
|
def clone_embedding_field(
|
||||||
self,
|
self,
|
||||||
new_embedding_field: str,
|
new_embedding_field: str,
|
||||||
|
|||||||
@ -66,7 +66,6 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
|
|||||||
similarity: str = "dot_product",
|
similarity: str = "dot_product",
|
||||||
return_embedding: bool = False,
|
return_embedding: bool = False,
|
||||||
duplicate_documents: str = "overwrite",
|
duplicate_documents: str = "overwrite",
|
||||||
index_type: str = "flat",
|
|
||||||
scroll: str = "1d",
|
scroll: str = "1d",
|
||||||
skip_missing_embeddings: bool = True,
|
skip_missing_embeddings: bool = True,
|
||||||
synonyms: Optional[List] = None,
|
synonyms: Optional[List] = None,
|
||||||
@ -105,10 +104,6 @@ class SearchEngineDocumentStore(KeywordDocumentStore):
|
|||||||
raise DocumentStoreError(
|
raise DocumentStoreError(
|
||||||
f"Invalid value {similarity} for similarity, choose between 'cosine', 'l2' and 'dot_product'"
|
f"Invalid value {similarity} for similarity, choose between 'cosine', 'l2' and 'dot_product'"
|
||||||
)
|
)
|
||||||
if index_type in ["flat", "hnsw"]:
|
|
||||||
self.index_type = index_type
|
|
||||||
else:
|
|
||||||
raise Exception("Invalid value for index_type in constructor. Choose between 'flat' and 'hnsw'")
|
|
||||||
|
|
||||||
self._init_indices(
|
self._init_indices(
|
||||||
index=index, label_index=label_index, create_index=create_index, recreate_index=recreate_index
|
index=index, label_index=label_index, create_index=create_index, recreate_index=recreate_index
|
||||||
|
|||||||
@ -66,6 +66,7 @@ dependencies = [
|
|||||||
"azure-ai-formrecognizer>=3.2.0b2", # forms reader
|
"azure-ai-formrecognizer>=3.2.0b2", # forms reader
|
||||||
# audio's espnet-model-zoo requires huggingface-hub version <0.8 while we need >=0.5 to be able to use create_repo in FARMReader
|
# audio's espnet-model-zoo requires huggingface-hub version <0.8 while we need >=0.5 to be able to use create_repo in FARMReader
|
||||||
"huggingface-hub>=0.5.0",
|
"huggingface-hub>=0.5.0",
|
||||||
|
"tenacity", # retry decorator
|
||||||
|
|
||||||
# Preprocessing
|
# Preprocessing
|
||||||
"more_itertools", # for windowing
|
"more_itertools", # for windowing
|
||||||
|
|||||||
@ -136,6 +136,35 @@ class TestFAISSDocumentStore(DocumentStoreBaseTestAbstract):
|
|||||||
# Check that get_embedding_count works as expected
|
# Check that get_embedding_count works as expected
|
||||||
assert document_store.get_embedding_count() == len(documents_with_embeddings)
|
assert document_store.get_embedding_count() == len(documents_with_embeddings)
|
||||||
|
|
||||||
|
@pytest.mark.integration
|
||||||
|
def test_train_index_from_docs(self, documents_with_embeddings, tmp_path):
|
||||||
|
document_store = FAISSDocumentStore(
|
||||||
|
sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving.db",
|
||||||
|
faiss_index_factory_str="IVF1,Flat",
|
||||||
|
isolation_level="AUTOCOMMIT",
|
||||||
|
return_embedding=True,
|
||||||
|
)
|
||||||
|
document_store.delete_all_documents(index=document_store.index)
|
||||||
|
|
||||||
|
assert not document_store.faiss_indexes[document_store.index].is_trained
|
||||||
|
document_store.train_index(documents_with_embeddings)
|
||||||
|
assert document_store.faiss_indexes[document_store.index].is_trained
|
||||||
|
|
||||||
|
@pytest.mark.integration
|
||||||
|
def test_train_index_from_embeddings(self, documents_with_embeddings, tmp_path):
|
||||||
|
document_store = FAISSDocumentStore(
|
||||||
|
sql_url=f"sqlite:///{tmp_path}/test_faiss_retrieving.db",
|
||||||
|
faiss_index_factory_str="IVF1,Flat",
|
||||||
|
isolation_level="AUTOCOMMIT",
|
||||||
|
return_embedding=True,
|
||||||
|
)
|
||||||
|
document_store.delete_all_documents(index=document_store.index)
|
||||||
|
|
||||||
|
embeddings = np.array([doc.embedding for doc in documents_with_embeddings])
|
||||||
|
assert not document_store.faiss_indexes[document_store.index].is_trained
|
||||||
|
document_store.train_index(embeddings=embeddings)
|
||||||
|
assert document_store.faiss_indexes[document_store.index].is_trained
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.integration
|
||||||
def test_write_docs_different_indexes(self, ds, documents_with_embeddings):
|
def test_write_docs_different_indexes(self, ds, documents_with_embeddings):
|
||||||
docs_a = documents_with_embeddings[:2]
|
docs_a = documents_with_embeddings[:2]
|
||||||
|
|||||||
@ -127,8 +127,11 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
OpenSearchDocumentStore(index="nmslib_index", create_index=True)
|
OpenSearchDocumentStore(index="nmslib_index", create_index=True)
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.integration
|
||||||
def test___init___faiss(self):
|
@pytest.mark.parametrize("index_type", ["flat", "hnsw", "ivf", "ivf_pq"])
|
||||||
OpenSearchDocumentStore(index="faiss_index", create_index=True, knn_engine="faiss")
|
def test___init___faiss(self, index_type):
|
||||||
|
OpenSearchDocumentStore(
|
||||||
|
index=f"faiss_index_{index_type}", recreate_index=True, knn_engine="faiss", index_type=index_type
|
||||||
|
)
|
||||||
|
|
||||||
@pytest.mark.integration
|
@pytest.mark.integration
|
||||||
def test___init___score_script(self):
|
def test___init___score_script(self):
|
||||||
@ -185,6 +188,107 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
for result in results:
|
for result in results:
|
||||||
assert len(result) == 3
|
assert len(result) == 3
|
||||||
|
|
||||||
|
@pytest.mark.integration
|
||||||
|
@pytest.mark.parametrize("index_type", ["ivf", "ivf_pq"])
|
||||||
|
def test_train_index_from_documents(self, ds: OpenSearchDocumentStore, documents, index_type):
|
||||||
|
# Create another document store on top of the previous one
|
||||||
|
ds = OpenSearchDocumentStore(
|
||||||
|
index=ds.index,
|
||||||
|
label_index=ds.label_index,
|
||||||
|
recreate_index=True,
|
||||||
|
knn_engine="faiss",
|
||||||
|
index_type=index_type,
|
||||||
|
knn_parameters={"code_size": 2},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check that IVF indices use score_script before training
|
||||||
|
emb_field_settings = ds.client.indices.get(ds.index)[ds.index]["mappings"]["properties"][ds.embedding_field]
|
||||||
|
assert emb_field_settings == {"type": "knn_vector", "dimension": 768}
|
||||||
|
|
||||||
|
ds.train_index(documents)
|
||||||
|
# Check that embedding_field_settings have been updated
|
||||||
|
emb_field_settings = ds.client.indices.get(ds.index)[ds.index]["mappings"]["properties"][ds.embedding_field]
|
||||||
|
assert emb_field_settings == {"type": "knn_vector", "model_id": f"{ds.index}-ivf"}
|
||||||
|
|
||||||
|
# Check that model uses expected parameters
|
||||||
|
expected_model_settigns = {"index_type": index_type, "nlist": 4, "nprobes": 1}
|
||||||
|
if index_type == "ivf_pq":
|
||||||
|
expected_model_settigns["code_size"] = 2
|
||||||
|
expected_model_settigns["m"] = 1
|
||||||
|
model_endpoint = f"/_plugins/_knn/models/{ds.index}-ivf"
|
||||||
|
response = ds.client.transport.perform_request("GET", url=model_endpoint)
|
||||||
|
model_settings_list = [setting.split(":") for setting in response["description"].split()]
|
||||||
|
model_settings = {k: (int(v) if v.isnumeric() else v) for k, v in model_settings_list}
|
||||||
|
assert model_settings == expected_model_settigns
|
||||||
|
|
||||||
|
@pytest.mark.integration
|
||||||
|
@pytest.mark.parametrize("index_type", ["ivf", "ivf_pq"])
|
||||||
|
def test_train_index_from_embeddings(self, ds: OpenSearchDocumentStore, documents, index_type):
|
||||||
|
# Create another document store on top of the previous one
|
||||||
|
ds = OpenSearchDocumentStore(
|
||||||
|
index=ds.index,
|
||||||
|
label_index=ds.label_index,
|
||||||
|
recreate_index=True,
|
||||||
|
knn_engine="faiss",
|
||||||
|
index_type=index_type,
|
||||||
|
knn_parameters={"code_size": 2},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check that IVF indices use HNSW with default settings before training
|
||||||
|
emb_field_settings = ds.client.indices.get(ds.index)[ds.index]["mappings"]["properties"][ds.embedding_field]
|
||||||
|
assert emb_field_settings == {"type": "knn_vector", "dimension": 768}
|
||||||
|
|
||||||
|
embeddings = np.array([doc.embedding for doc in documents if doc.embedding is not None])
|
||||||
|
ds.train_index(embeddings=embeddings)
|
||||||
|
# Check that embedding_field_settings have been updated
|
||||||
|
emb_field_settings = ds.client.indices.get(ds.index)[ds.index]["mappings"]["properties"][ds.embedding_field]
|
||||||
|
assert emb_field_settings == {"type": "knn_vector", "model_id": f"{ds.index}-ivf"}
|
||||||
|
|
||||||
|
# Check that model uses expected parameters
|
||||||
|
expected_model_settigns = {"index_type": index_type, "nlist": 4, "nprobes": 1}
|
||||||
|
if index_type == "ivf_pq":
|
||||||
|
expected_model_settigns["code_size"] = 2
|
||||||
|
expected_model_settigns["m"] = 1
|
||||||
|
model_endpoint = f"/_plugins/_knn/models/{ds.index}-ivf"
|
||||||
|
response = ds.client.transport.perform_request("GET", url=model_endpoint)
|
||||||
|
model_settings_list = [setting.split(":") for setting in response["description"].split()]
|
||||||
|
model_settings = {k: (int(v) if v.isnumeric() else v) for k, v in model_settings_list}
|
||||||
|
assert model_settings == expected_model_settigns
|
||||||
|
|
||||||
|
@pytest.mark.integration
|
||||||
|
@pytest.mark.parametrize("index_type", ["ivf", "ivf_pq"])
|
||||||
|
def test_train_index_with_write_documents(self, ds: OpenSearchDocumentStore, documents, index_type):
|
||||||
|
# Create another document store on top of the previous one
|
||||||
|
ds = OpenSearchDocumentStore(
|
||||||
|
index=ds.index,
|
||||||
|
label_index=ds.label_index,
|
||||||
|
recreate_index=True,
|
||||||
|
knn_engine="faiss",
|
||||||
|
index_type=index_type,
|
||||||
|
knn_parameters={"code_size": 2},
|
||||||
|
ivf_train_size=6,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check that IVF indices use HNSW with default settings before training
|
||||||
|
emb_field_settings = ds.client.indices.get(ds.index)[ds.index]["mappings"]["properties"][ds.embedding_field]
|
||||||
|
assert emb_field_settings == {"type": "knn_vector", "dimension": 768}
|
||||||
|
|
||||||
|
ds.write_documents(documents)
|
||||||
|
# Check that embedding_field_settings have been updated
|
||||||
|
emb_field_settings = ds.client.indices.get(ds.index)[ds.index]["mappings"]["properties"][ds.embedding_field]
|
||||||
|
assert emb_field_settings == {"type": "knn_vector", "model_id": f"{ds.index}-ivf"}
|
||||||
|
|
||||||
|
# Check that model uses expected parameters
|
||||||
|
expected_model_settigns = {"index_type": index_type, "nlist": 4, "nprobes": 1}
|
||||||
|
if index_type == "ivf_pq":
|
||||||
|
expected_model_settigns["code_size"] = 2
|
||||||
|
expected_model_settigns["m"] = 1
|
||||||
|
model_endpoint = f"/_plugins/_knn/models/{ds.index}-ivf"
|
||||||
|
response = ds.client.transport.perform_request("GET", url=model_endpoint)
|
||||||
|
model_settings_list = [setting.split(":") for setting in response["description"].split()]
|
||||||
|
model_settings = {k: (int(v) if v.isnumeric() else v) for k, v in model_settings_list}
|
||||||
|
assert model_settings == expected_model_settigns
|
||||||
|
|
||||||
# Unit tests
|
# Unit tests
|
||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
@ -294,6 +398,20 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
with pytest.raises(DocumentStoreError):
|
with pytest.raises(DocumentStoreError):
|
||||||
mocked_document_store.query_by_embedding(self.query_emb)
|
mocked_document_store.query_by_embedding(self.query_emb)
|
||||||
|
|
||||||
|
@pytest.mark.unit
|
||||||
|
def test_query_by_embedding_raises_if_ivf_untrained(self, mocked_document_store):
|
||||||
|
mocked_document_store.index_type = "ivf"
|
||||||
|
mocked_document_store.ivf_train_size = 10
|
||||||
|
with pytest.raises(DocumentStoreError, match="Index of type 'ivf' is not trained yet."):
|
||||||
|
mocked_document_store.query_by_embedding(self.query_emb)
|
||||||
|
|
||||||
|
@pytest.mark.unit
|
||||||
|
def test_query_by_embedding_batch_if_ivf_untrained(self, mocked_document_store):
|
||||||
|
mocked_document_store.index_type = "ivf"
|
||||||
|
mocked_document_store.ivf_train_size = 10
|
||||||
|
with pytest.raises(DocumentStoreError, match="Index of type 'ivf' is not trained yet."):
|
||||||
|
mocked_document_store.query_by_embedding_batch([self.query_emb])
|
||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test_query_by_embedding_filters(self, mocked_document_store):
|
def test_query_by_embedding_filters(self, mocked_document_store):
|
||||||
assert mocked_document_store.knn_engine != "score_script"
|
assert mocked_document_store.knn_engine != "score_script"
|
||||||
@ -649,8 +767,10 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test__init_indices_creates_index_if_exists_and_recreate_index(self, mocked_document_store):
|
def test__init_indices_creates_index_if_exists_and_recreate_index(self, mocked_document_store):
|
||||||
# delete_index askes twice + one check for each index creation
|
# delete_index asks four times: one check for doc index, one check for label index
|
||||||
mocked_document_store.client.indices.exists.side_effect = [True, True, False, False]
|
# + one check for both if ivf model exists
|
||||||
|
# create_index asks two times: one for doc index, one for label index
|
||||||
|
mocked_document_store.client.indices.exists.side_effect = [True, False, True, False, False, False]
|
||||||
mocked_document_store._init_indices(self.index_name, "label_index", create_index=True, recreate_index=True)
|
mocked_document_store._init_indices(self.index_name, "label_index", create_index=True, recreate_index=True)
|
||||||
|
|
||||||
mocked_document_store.client.indices.delete.assert_called()
|
mocked_document_store.client.indices.delete.assert_called()
|
||||||
@ -824,7 +944,7 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
}
|
}
|
||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test__get_embedding_field_mapping_hnsw(self, mocked_document_store):
|
def test__get_embedding_field_mapping_default_hnsw(self, mocked_document_store):
|
||||||
mocked_document_store.index_type = "hnsw"
|
mocked_document_store.index_type = "hnsw"
|
||||||
|
|
||||||
assert mocked_document_store._get_embedding_field_mapping() == {
|
assert mocked_document_store._get_embedding_field_mapping() == {
|
||||||
@ -839,7 +959,7 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
}
|
}
|
||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test__get_embedding_field_mapping_hnsw_faiss(self, mocked_document_store):
|
def test__get_embedding_field_mapping_default_hnsw_faiss(self, mocked_document_store):
|
||||||
mocked_document_store.index_type = "hnsw"
|
mocked_document_store.index_type = "hnsw"
|
||||||
mocked_document_store.knn_engine = "faiss"
|
mocked_document_store.knn_engine = "faiss"
|
||||||
|
|
||||||
@ -854,6 +974,127 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@pytest.mark.unit
|
||||||
|
def test__get_embedding_field_mapping_custom_hnsw(self, mocked_document_store):
|
||||||
|
mocked_document_store.index_type = "hnsw"
|
||||||
|
mocked_document_store.knn_parameters = {"ef_construction": 1, "m": 2}
|
||||||
|
|
||||||
|
assert mocked_document_store._get_embedding_field_mapping() == {
|
||||||
|
"type": "knn_vector",
|
||||||
|
"dimension": 768,
|
||||||
|
"method": {
|
||||||
|
"space_type": "innerproduct",
|
||||||
|
"engine": "nmslib",
|
||||||
|
"name": "hnsw",
|
||||||
|
"parameters": {"ef_construction": 1, "m": 2},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
@pytest.mark.unit
|
||||||
|
def test__get_embedding_field_mapping_custom_hnsw_faiss(self, mocked_document_store):
|
||||||
|
mocked_document_store.index_type = "hnsw"
|
||||||
|
mocked_document_store.knn_engine = "faiss"
|
||||||
|
mocked_document_store.knn_parameters = {"ef_construction": 1, "m": 2, "ef_search": 3}
|
||||||
|
|
||||||
|
assert mocked_document_store._get_embedding_field_mapping() == {
|
||||||
|
"type": "knn_vector",
|
||||||
|
"dimension": 768,
|
||||||
|
"method": {
|
||||||
|
"space_type": "innerproduct",
|
||||||
|
"engine": "faiss",
|
||||||
|
"name": "hnsw",
|
||||||
|
"parameters": {"ef_construction": 1, "m": 2, "ef_search": 3},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
@pytest.mark.unit
|
||||||
|
def test__get_embedding_field_mapping_ivf(self, mocked_document_store):
|
||||||
|
mocked_document_store.index_type = "ivf"
|
||||||
|
mocked_document_store.knn_engine = "faiss"
|
||||||
|
mocked_document_store.client.indices.exists.return_value = False
|
||||||
|
|
||||||
|
# Before training, IVF indices use HNSW with default settings
|
||||||
|
assert mocked_document_store._get_embedding_field_mapping() == {"type": "knn_vector", "dimension": 768}
|
||||||
|
|
||||||
|
# Assume we have trained the index
|
||||||
|
mocked_document_store.client.indices.exists.return_value = True
|
||||||
|
mocked_document_store.client.transport.perform_request.return_value = {
|
||||||
|
"took": 4,
|
||||||
|
"timed_out": False,
|
||||||
|
"_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0},
|
||||||
|
"hits": {
|
||||||
|
"total": {"value": 1, "relation": "eq"},
|
||||||
|
"max_score": 1.0,
|
||||||
|
"hits": [
|
||||||
|
{
|
||||||
|
"_index": ".opensearch-knn-models",
|
||||||
|
"_type": "_doc",
|
||||||
|
"_id": "document-ivf",
|
||||||
|
"_score": 1.0,
|
||||||
|
"_source": {
|
||||||
|
"model_blob": "<SOME MODEL BLOB>",
|
||||||
|
"engine": "faiss",
|
||||||
|
"space_type": "innerproduct",
|
||||||
|
"description": "index_type:ivf nlist:4 nprobes:1",
|
||||||
|
"model_id": f"{mocked_document_store.index}-ivf",
|
||||||
|
"state": "created",
|
||||||
|
"error": "",
|
||||||
|
"dimension": 768,
|
||||||
|
"timestamp": "2023-01-25T16:04:21.284398Z",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
assert mocked_document_store._get_embedding_field_mapping() == {
|
||||||
|
"type": "knn_vector",
|
||||||
|
"model_id": f"{mocked_document_store.index}-ivf",
|
||||||
|
}
|
||||||
|
|
||||||
|
@pytest.mark.unit
|
||||||
|
def test__get_embedding_field_mapping_ivfpq(self, mocked_document_store):
|
||||||
|
mocked_document_store.index_type = "ivf_pq"
|
||||||
|
mocked_document_store.knn_engine = "faiss"
|
||||||
|
mocked_document_store.client.indices.exists.return_value = False
|
||||||
|
|
||||||
|
# Before training, IVF indices use HNSW with default settings
|
||||||
|
assert mocked_document_store._get_embedding_field_mapping() == {"type": "knn_vector", "dimension": 768}
|
||||||
|
|
||||||
|
# Assume we have trained the index
|
||||||
|
mocked_document_store.client.indices.exists.return_value = True
|
||||||
|
mocked_document_store.client.transport.perform_request.return_value = {
|
||||||
|
"took": 4,
|
||||||
|
"timed_out": False,
|
||||||
|
"_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0},
|
||||||
|
"hits": {
|
||||||
|
"total": {"value": 1, "relation": "eq"},
|
||||||
|
"max_score": 1.0,
|
||||||
|
"hits": [
|
||||||
|
{
|
||||||
|
"_index": ".opensearch-knn-models",
|
||||||
|
"_type": "_doc",
|
||||||
|
"_id": "document-ivf",
|
||||||
|
"_score": 1.0,
|
||||||
|
"_source": {
|
||||||
|
"model_blob": "<SOME MODEL BLOB>",
|
||||||
|
"engine": "faiss",
|
||||||
|
"space_type": "innerproduct",
|
||||||
|
"description": "index_type:ivf_pq nlist:4 nprobes:1 m:1 code_size:8",
|
||||||
|
"model_id": f"{mocked_document_store.index}-ivf",
|
||||||
|
"state": "created",
|
||||||
|
"error": "",
|
||||||
|
"dimension": 768,
|
||||||
|
"timestamp": "2023-01-25T16:04:21.284398Z",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
assert mocked_document_store._get_embedding_field_mapping() == {
|
||||||
|
"type": "knn_vector",
|
||||||
|
"model_id": f"{mocked_document_store.index}-ivf",
|
||||||
|
}
|
||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test__get_embedding_field_mapping_wrong(self, mocked_document_store, caplog):
|
def test__get_embedding_field_mapping_wrong(self, mocked_document_store, caplog):
|
||||||
mocked_document_store.index_type = "foo"
|
mocked_document_store.index_type = "foo"
|
||||||
@ -861,7 +1102,7 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc
|
|||||||
with caplog.at_level(logging.ERROR, logger="haystack.document_stores.opensearch"):
|
with caplog.at_level(logging.ERROR, logger="haystack.document_stores.opensearch"):
|
||||||
retval = mocked_document_store._get_embedding_field_mapping()
|
retval = mocked_document_store._get_embedding_field_mapping()
|
||||||
|
|
||||||
assert "Set index_type to either 'flat' or 'hnsw'" in caplog.text
|
assert "Set index_type to either 'flat', 'hnsw', 'ivf', or 'ivf_pq'" in caplog.text
|
||||||
assert retval == {
|
assert retval == {
|
||||||
"type": "knn_vector",
|
"type": "knn_vector",
|
||||||
"dimension": 768,
|
"dimension": 768,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user