Fix embeddings_field_supports_similarity of OpenSearchDocumentStore when creating index (#3030)

* fix embeddings_field_supports_similarity when creating index

* fix test
This commit is contained in:
tstadel 2022-08-12 11:19:59 +02:00 committed by GitHub
parent 26c938a8e6
commit 668fd548a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 23 deletions

View File

@ -427,26 +427,21 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore):
embedding_field_space_type = index_settings["knn.space_type"]
# embedding field with local space_type setting
else:
# embedding field with global space_type setting
if "method" not in mappings["properties"][self.embedding_field]:
embedding_field_space_type = index_settings["knn.space_type"]
# embedding field with local space_type setting
else:
embedding_field_space_type = mappings["properties"][self.embedding_field]["method"][
"space_type"
]
embedding_field_space_type = mappings["properties"][self.embedding_field]["method"][
"space_type"
]
embedding_field_similarity = self.space_type_to_similarity[embedding_field_space_type]
if embedding_field_similarity == self.similarity:
self.embeddings_field_supports_similarity = True
else:
logger.warning(
f"Embedding field '{self.embedding_field}' is optimized for similarity '{embedding_field_similarity}'. "
f"Falling back to slow exact vector calculation. "
f"Consider cloning the embedding field optimized for '{embedding_field_similarity}' by calling clone_embedding_field(similarity='{embedding_field_similarity}', ...) "
f"or creating a new index optimized for '{self.similarity}' by setting `similarity='{self.similarity}'` the first time you instantiate OpenSearchDocumentStore for the new index, "
f"e.g. `OpenSearchDocumentStore(index='my_new_{self.similarity}_index', similarity='{self.similarity}')`."
)
embedding_field_similarity = self.space_type_to_similarity[embedding_field_space_type]
if embedding_field_similarity == self.similarity:
self.embeddings_field_supports_similarity = True
else:
logger.warning(
f"Embedding field '{self.embedding_field}' is optimized for similarity '{embedding_field_similarity}'. "
f"Falling back to slow exact vector calculation. "
f"Consider cloning the embedding field optimized for '{embedding_field_similarity}' by calling clone_embedding_field(similarity='{embedding_field_similarity}', ...) "
f"or creating a new index optimized for '{self.similarity}' by setting `similarity='{self.similarity}'` the first time you instantiate OpenSearchDocumentStore for the new index, "
f"e.g. `OpenSearchDocumentStore(index='my_new_{self.similarity}_index', similarity='{self.similarity}')`."
)
# Adjust global ef_search setting. If not set, default is 512.
ef_search = index_settings.get("knn.algo_param", {"ef_search": 512}).get("ef_search", 512)
@ -498,6 +493,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore):
)
try:
self.embeddings_field_supports_similarity = True
self.client.indices.create(index=index_name, body=index_definition, headers=headers)
except RequestError as e:
# With multiple workers we need to avoid race conditions, where:

View File

@ -1,4 +1,3 @@
import sys
import logging
from unittest.mock import MagicMock
@ -390,9 +389,7 @@ class TestOpenSearchDocumentStore:
mocked_document_store.embedding_field = "vec"
mocked_document_store._create_document_index(self.index_name)
# FIXME: when `method` is missing from the field mapping, embeddings_field_supports_similarity is always
# False but I'm not sure this is by design
assert mocked_document_store.embeddings_field_supports_similarity is False
assert mocked_document_store.embeddings_field_supports_similarity is True
@pytest.mark.unit
def test__create_document_index_with_existing_mapping_similarity(self, mocked_document_store, index):
@ -494,6 +491,7 @@ class TestOpenSearchDocumentStore:
mocked_document_store._create_document_index(self.index_name)
_, kwargs = mocked_document_store.client.indices.create.call_args
assert kwargs["body"] == {"mappings": {"properties": {"a_number": {"type": "integer"}}}}
assert mocked_document_store.embeddings_field_supports_similarity is True
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping(self, mocked_document_store):
@ -522,6 +520,7 @@ class TestOpenSearchDocumentStore:
},
"settings": {"analysis": {"analyzer": {"default": {"type": "standard"}}}, "index": {"knn": True}},
}
assert mocked_document_store.embeddings_field_supports_similarity is True
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping_with_synonyms(self, mocked_document_store):
@ -563,6 +562,7 @@ class TestOpenSearchDocumentStore:
"index": {"knn": True},
},
}
assert mocked_document_store.embeddings_field_supports_similarity is True
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping_with_embedding_field(self, mocked_document_store):
@ -597,6 +597,7 @@ class TestOpenSearchDocumentStore:
"index": {"knn": True, "knn.algo_param.ef_search": 20},
},
}
assert mocked_document_store.embeddings_field_supports_similarity is True
@pytest.mark.unit
def test__create_document_index_client_failure(self, mocked_document_store):