mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-29 11:50:34 +00:00

* ci: Use ruff in pre-commit to further limit complexity * Fix invalid escape sequences in Python code * Delete releasenotes/notes/ruff-4d2504d362035166.yaml
1302 lines
60 KiB
Python
1302 lines
60 KiB
Python
import os
|
|
import logging
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
import numpy as np
|
|
|
|
import opensearchpy
|
|
|
|
from haystack.document_stores.opensearch import (
|
|
OpenSearch,
|
|
OpenSearchDocumentStore,
|
|
RequestsHttpConnection,
|
|
Urllib3HttpConnection,
|
|
RequestError,
|
|
tqdm,
|
|
)
|
|
from haystack.errors import DocumentStoreError
|
|
from haystack.testing import DocumentStoreBaseTestAbstract
|
|
|
|
from .test_search_engine import SearchEngineDocumentStoreTestAbstract
|
|
|
|
|
|
class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDocumentStoreTestAbstract):
|
|
# Constants
|
|
query_emb = np.random.random_sample(size=(2, 2))
|
|
index_name = __name__
|
|
|
|
# Fixtures
|
|
|
|
@pytest.fixture
def ds(self):
    """
    Yield an OpenSearchDocumentStore pointed at this test module's index.

    The store is created with ``create_index=True`` and ``recreate_index=True``.
    The host is read from the ``OPENSEARCH_HOST`` environment variable and falls
    back to ``localhost``. Nothing runs after the ``yield``.
    """
    store = OpenSearchDocumentStore(
        index=self.index_name,
        label_index=f"{self.index_name}_labels",
        host=os.environ.get("OPENSEARCH_HOST", "localhost"),
        create_index=True,
        recreate_index=True,
    )
    yield store
|
|
|
|
@pytest.fixture
def mocked_document_store(self, existing_index):
    """
    Return an instance of an OpenSearchDocumentStore subclass backed by a MagicMock client.

    The mocked client reports that indices exist, returns ``existing_index`` as the
    definition of ``self.index_name`` and advertises version ``1.3.5``.
    """

    class DSMock(OpenSearchDocumentStore):
        # Patching is done on this subclass so the real class object stays untouched
        pass

    client = MagicMock()
    client.info.return_value = {"version": {"number": "1.3.5"}}
    client.indices.get.return_value = {self.index_name: existing_index}
    client.indices.exists.return_value = True

    # Every call to `_init_client` hands back the mocked client
    DSMock._init_client = MagicMock(return_value=client)
    return DSMock()
|
|
|
|
@pytest.fixture
def mocked_open_search_init(self, monkeypatch):
    """
    Swap ``OpenSearch.__init__`` for a MagicMock returning ``None`` and return that mock,
    so tests can inspect the arguments the client was constructed with.
    """
    init_stub = MagicMock(return_value=None)
    monkeypatch.setattr(OpenSearch, "__init__", init_stub)
    return init_stub
|
|
|
|
@pytest.fixture
def _init_client_params(self):
    """
    Return a fresh dict of keyword arguments for ``OpenSearchDocumentStore._init_client``.

    Tests mutate single entries of the returned dict before passing it on.
    """
    return dict(
        host="localhost",
        port=9999,
        username="user",
        password="pass",
        aws4auth=None,
        scheme="http",
        ca_certs="ca_certs",
        verify_certs=True,
        timeout=42,
        use_system_proxy=True,
    )
|
|
|
|
@pytest.fixture
def existing_index(self):
    """
    Return a fresh index definition (aliases, mappings, settings) used as the mocked
    response of ``client.indices.get``.

    The ``embedding`` field is a 768-dimensional ``knn_vector`` with an nmslib/hnsw method.
    """
    embedding_method = {
        "engine": "nmslib",
        "space_type": "innerproduct",
        "name": "hnsw",
        "parameters": {"ef_construction": 512, "m": 16},
    }
    field_mappings = {
        "content": {"type": "text"},
        "embedding": {"type": "knn_vector", "dimension": 768, "method": embedding_method},
    }
    index_settings = {
        "creation_date": "1658337984559",
        "number_of_shards": "1",
        "number_of_replicas": "1",
        "uuid": "jU5KPBtXQHOaIn2Cm2d4jg",
        "version": {"created": "135238227"},
        "provided_name": "existing_index",
    }
    return {"aliases": {}, "mappings": {"properties": field_mappings}, "settings": {"index": index_settings}}
|
|
|
|
# Integration tests
|
|
|
|
@pytest.mark.integration
def test___init__(self):
    """Constructing a store with default engine settings and ``create_index=True`` must not raise."""
    init_kwargs = {"index": "nmslib_index", "create_index": True}
    OpenSearchDocumentStore(**init_kwargs)
|
|
|
|
@pytest.mark.integration
@pytest.mark.parametrize("index_type", ["flat", "hnsw", "ivf", "ivf_pq"])
def test___init___faiss(self, index_type):
    """Constructing a faiss-backed store must not raise for any parametrized index type."""
    index_name = f"faiss_index_{index_type}"
    OpenSearchDocumentStore(index=index_name, recreate_index=True, knn_engine="faiss", index_type=index_type)
|
|
|
|
@pytest.mark.integration
def test___init___score_script(self):
    """Constructing a store with ``knn_engine="score_script"`` must not raise."""
    init_kwargs = {"index": "score_script_index", "create_index": True, "knn_engine": "score_script"}
    OpenSearchDocumentStore(**init_kwargs)
|
|
|
|
@pytest.mark.integration
def test_recreate_index(self, ds, documents, labels):
    """A second store created with ``recreate_index=True`` on the same indices sees no documents or labels."""
    ds.write_documents(documents)
    ds.write_labels(labels)

    # Second store layered on the very same document and label indices
    recreated = OpenSearchDocumentStore(index=ds.index, label_index=ds.label_index, recreate_index=True)

    remaining_docs = recreated.get_all_documents(index=recreated.index)
    assert len(remaining_docs) == 0
    remaining_labels = recreated.get_all_labels(index=recreated.label_index)
    assert len(remaining_labels) == 0
|
|
|
|
@pytest.mark.integration
def test_clone_embedding_field(self, ds, documents):
    """After cloning the embedding field, the clone appears in the meta of exactly the docs not flagged ``no_embedding``."""
    clone_name = "cloned"
    ds.write_documents(documents)
    ds.clone_embedding_field(clone_name, "cosine")

    for document in ds.get_all_documents():
        meta = document.to_dict()["meta"]
        # Docs flagged `no_embedding` must be skipped; all others must carry the clone
        should_have_clone = "no_embedding" not in meta
        assert (clone_name in meta) == should_have_clone
|
|
|
|
@pytest.mark.integration
@pytest.mark.parametrize("knn_engine", ["nmslib", "faiss", "score_script"])
def test_query_embedding_with_filters(self, ds: OpenSearchDocumentStore, documents, knn_engine):
    """Filtered embedding queries return three hits for ``year == "2020"`` with every knn engine."""
    # Rebuild the fixture's indices with the engine under test
    engine_store = OpenSearchDocumentStore(
        index=ds.index, label_index=ds.label_index, recreate_index=True, knn_engine=knn_engine
    )
    engine_store.write_documents(documents)

    query_vector = np.random.rand(768).astype(np.float32)
    hits = engine_store.query_by_embedding(query_emb=query_vector, filters={"year": "2020"}, top_k=10)
    assert len(hits) == 3
|
|
|
|
@pytest.mark.integration
@pytest.mark.parametrize("use_ann", [True, False])
def test_query_embedding_batch_with_filters(self, ds: OpenSearchDocumentStore, documents, use_ann):
    """Batched filtered embedding queries return three hits per query, whatever ``embeddings_field_supports_similarity`` is."""
    ds.embeddings_field_supports_similarity = use_ann
    ds.write_documents(documents)

    query_vectors = [np.random.rand(768).astype(np.float32) for _ in range(2)]
    per_query_filters = [{"year": "2020"} for _ in range(2)]
    batches = ds.query_by_embedding_batch(query_embs=query_vectors, filters=per_query_filters, top_k=10)

    assert len(batches) == 2
    assert all(len(hits) == 3 for hits in batches)
|
|
|
|
@pytest.mark.integration
@pytest.mark.parametrize("index_type", ["ivf", "ivf_pq"])
def test_train_index_from_documents(self, ds: OpenSearchDocumentStore, documents, index_type):
    """Training a faiss IVF index from documents switches the embedding mapping to a model and stores the expected settings."""

    def embedding_mapping(store):
        # Mapping of the embedding field as currently reported by the cluster
        return store.client.indices.get(store.index)[store.index]["mappings"]["properties"][store.embedding_field]

    # Rebuild the fixture's indices as a faiss IVF store
    ivf_store = OpenSearchDocumentStore(
        index=ds.index,
        label_index=ds.label_index,
        recreate_index=True,
        knn_engine="faiss",
        index_type=index_type,
        knn_parameters={"code_size": 2},
    )

    # Before training, the field is a bare knn_vector mapping (no method, no model)
    assert embedding_mapping(ivf_store) == {"type": "knn_vector", "dimension": 768}

    ivf_store.train_index(documents)

    # After training, the mapping references the trained model
    assert embedding_mapping(ivf_store) == {"type": "knn_vector", "model_id": f"{ivf_store.index}-ivf"}

    expected_model_settings = {"index_type": index_type, "nlist": 4, "nprobes": 1}
    if index_type == "ivf_pq":
        expected_model_settings.update(code_size=2, m=1)

    # The model description is a whitespace-separated list of `key:value` pairs
    response = ivf_store.client.transport.perform_request("GET", url=f"/_plugins/_knn/models/{ivf_store.index}-ivf")
    model_settings = {}
    for entry in response["description"].split():
        key, raw_value = entry.split(":")
        model_settings[key] = int(raw_value) if raw_value.isnumeric() else raw_value
    assert model_settings == expected_model_settings
|
|
|
|
@pytest.mark.integration
@pytest.mark.parametrize("index_type", ["ivf", "ivf_pq"])
def test_train_index_from_embeddings(self, ds: OpenSearchDocumentStore, documents, index_type):
    """Training a faiss IVF index from raw embeddings switches the embedding mapping to a model and stores the expected settings."""

    def embedding_mapping(store):
        # Mapping of the embedding field as currently reported by the cluster
        return store.client.indices.get(store.index)[store.index]["mappings"]["properties"][store.embedding_field]

    # Rebuild the fixture's indices as a faiss IVF store
    ivf_store = OpenSearchDocumentStore(
        index=ds.index,
        label_index=ds.label_index,
        recreate_index=True,
        knn_engine="faiss",
        index_type=index_type,
        knn_parameters={"code_size": 2},
    )

    # Before training, the field is a bare knn_vector mapping (no method, no model)
    assert embedding_mapping(ivf_store) == {"type": "knn_vector", "dimension": 768}

    # Only documents that actually carry an embedding contribute training vectors
    training_vectors = np.array([doc.embedding for doc in documents if doc.embedding is not None])
    ivf_store.train_index(embeddings=training_vectors)

    # After training, the mapping references the trained model
    assert embedding_mapping(ivf_store) == {"type": "knn_vector", "model_id": f"{ivf_store.index}-ivf"}

    expected_model_settings = {"index_type": index_type, "nlist": 4, "nprobes": 1}
    if index_type == "ivf_pq":
        expected_model_settings.update(code_size=2, m=1)

    # The model description is a whitespace-separated list of `key:value` pairs
    response = ivf_store.client.transport.perform_request("GET", url=f"/_plugins/_knn/models/{ivf_store.index}-ivf")
    model_settings = {}
    for entry in response["description"].split():
        key, raw_value = entry.split(":")
        model_settings[key] = int(raw_value) if raw_value.isnumeric() else raw_value
    assert model_settings == expected_model_settings
|
|
|
|
@pytest.mark.integration
@pytest.mark.parametrize("index_type", ["ivf", "ivf_pq"])
def test_train_index_with_write_documents(self, ds: OpenSearchDocumentStore, documents, index_type):
    """With ``ivf_train_size=6``, writing the documents alone leaves the embedding mapping pointing at a trained model."""

    def embedding_mapping(store):
        # Mapping of the embedding field as currently reported by the cluster
        return store.client.indices.get(store.index)[store.index]["mappings"]["properties"][store.embedding_field]

    # Rebuild the fixture's indices as a faiss IVF store with a training threshold
    ivf_store = OpenSearchDocumentStore(
        index=ds.index,
        label_index=ds.label_index,
        recreate_index=True,
        knn_engine="faiss",
        index_type=index_type,
        knn_parameters={"code_size": 2},
        ivf_train_size=6,
    )

    # Before any write, the field is a bare knn_vector mapping (no method, no model)
    assert embedding_mapping(ivf_store) == {"type": "knn_vector", "dimension": 768}

    ivf_store.write_documents(documents)

    # After the write, the mapping references the trained model
    assert embedding_mapping(ivf_store) == {"type": "knn_vector", "model_id": f"{ivf_store.index}-ivf"}

    expected_model_settings = {"index_type": index_type, "nlist": 4, "nprobes": 1}
    if index_type == "ivf_pq":
        expected_model_settings.update(code_size=2, m=1)

    # The model description is a whitespace-separated list of `key:value` pairs
    response = ivf_store.client.transport.perform_request("GET", url=f"/_plugins/_knn/models/{ivf_store.index}-ivf")
    model_settings = {}
    for entry in response["description"].split():
        key, raw_value = entry.split(":")
        model_settings[key] = int(raw_value) if raw_value.isnumeric() else raw_value
    assert model_settings == expected_model_settings
|
|
|
|
# Unit tests
|
|
|
|
@pytest.mark.unit
def test___init___api_key_raises_warning(self, mocked_document_store, caplog):
    """Passing ``api_key`` and/or ``api_key_id`` to ``__init__`` logs exactly one WARNING per call."""
    api_key_variants = (
        {"api_key": "foo"},
        {"api_key_id": "bar"},
        {"api_key": "foo", "api_key_id": "bar"},
    )
    with caplog.at_level(logging.WARNING, logger="haystack.document_stores.opensearch"):
        for init_kwargs in api_key_variants:
            mocked_document_store.__init__(**init_kwargs)

    assert len(caplog.records) == 3
    assert all(record.levelname == "WARNING" for record in caplog.records)
|
|
|
|
@pytest.mark.unit
def test__init_client_aws4auth_and_username_raises_warning(self, mocked_open_search_init, caplog):
    """``_init_client`` warns when both ``username`` and ``aws4auth`` are given, and stays silent when only one is."""
    shared_kwargs = dict(
        host="host",
        port=443,
        password="pass",
        scheme="https",
        ca_certs=None,
        verify_certs=True,
        timeout=10,
        use_system_proxy=False,
    )
    opensearch_logger = "haystack.document_stores.opensearch"

    # Both credentials supplied: one warning per call
    with caplog.at_level(logging.WARNING, logger=opensearch_logger):
        for username in ("admin", "bar"):
            OpenSearchDocumentStore._init_client(username=username, aws4auth="foo", **shared_kwargs)
    assert len(caplog.records) == 2
    assert all(record.levelname == "WARNING" for record in caplog.records)

    # Only one credential supplied: nothing is logged
    caplog.clear()
    with caplog.at_level(logging.WARNING, logger=opensearch_logger):
        OpenSearchDocumentStore._init_client(username=None, aws4auth="foo", **shared_kwargs)
        OpenSearchDocumentStore._init_client(username="foo", aws4auth=None, **shared_kwargs)
    assert len(caplog.records) == 0
|
|
|
|
@pytest.mark.unit
def test___init___connection_test_fails(self, mocked_document_store):
    """``__init__`` raises ``ConnectionError`` when the client's ``indices.get`` call blows up."""
    broken_client = MagicMock()
    broken_client.configure_mock(**{"indices.get.side_effect": Exception("The client failed!")})
    mocked_document_store._init_client.return_value = broken_client

    with pytest.raises(ConnectionError):
        mocked_document_store.__init__()
|
|
|
|
@pytest.mark.unit
def test___init___client_params(self, mocked_open_search_init, _init_client_params):
    """
    ``_init_client`` must forward the expected keyword arguments to the opensearch-py client constructor
    """
    expected_kwargs = dict(
        hosts=[{"host": "localhost", "port": 9999}],
        http_auth=("user", "pass"),
        scheme="http",
        ca_certs="ca_certs",
        verify_certs=True,
        timeout=42,
        connection_class=RequestsHttpConnection,
    )

    OpenSearchDocumentStore._init_client(**_init_client_params)

    assert mocked_open_search_init.called
    assert mocked_open_search_init.call_args.kwargs == expected_kwargs
|
|
|
|
@pytest.mark.unit
def test__init_client_use_system_proxy_use_sys_proxy(self, mocked_open_search_init, _init_client_params):
    """With ``use_system_proxy=False`` the client is built with ``Urllib3HttpConnection``."""
    # NOTE(review): the test name says "use_sys_proxy" yet the flag is False here; this name and
    # the sibling "dont_use_sys_proxy" test look swapped - confirm before renaming.
    _init_client_params.update(use_system_proxy=False)
    OpenSearchDocumentStore._init_client(**_init_client_params)

    passed_kwargs = mocked_open_search_init.call_args.kwargs
    assert passed_kwargs["connection_class"] == Urllib3HttpConnection
|
|
|
|
@pytest.mark.unit
def test__init_client_use_system_proxy_dont_use_sys_proxy(self, mocked_open_search_init, _init_client_params):
    """With ``use_system_proxy=True`` the client is built with ``RequestsHttpConnection``."""
    # NOTE(review): the test name says "dont_use_sys_proxy" yet the flag is True here; this name and
    # the sibling "use_sys_proxy" test look swapped - confirm before renaming.
    _init_client_params.update(use_system_proxy=True)
    OpenSearchDocumentStore._init_client(**_init_client_params)

    passed_kwargs = mocked_open_search_init.call_args.kwargs
    assert passed_kwargs["connection_class"] == RequestsHttpConnection
|
|
|
|
@pytest.mark.unit
def test__init_client_auth_methods_username_password(self, mocked_open_search_init, _init_client_params):
    """A username without ``aws4auth`` results in ``http_auth`` being the ``(username, password)`` tuple."""
    _init_client_params.update(username="user", aws4auth=None)
    OpenSearchDocumentStore._init_client(**_init_client_params)

    passed_kwargs = mocked_open_search_init.call_args.kwargs
    assert passed_kwargs["http_auth"] == ("user", "pass")
|
|
|
|
@pytest.mark.unit
def test__init_client_auth_methods_aws_iam(self, mocked_open_search_init, _init_client_params):
    """An empty username with ``aws4auth`` set results in ``http_auth`` being the ``aws4auth`` value."""
    _init_client_params.update(username="", aws4auth="foo")
    OpenSearchDocumentStore._init_client(**_init_client_params)

    passed_kwargs = mocked_open_search_init.call_args.kwargs
    assert passed_kwargs["http_auth"] == "foo"
|
|
|
|
@pytest.mark.unit
def test__init_client_auth_methods_no_auth(self, mocked_open_search_init, _init_client_params):
    """With neither a username nor ``aws4auth``, no ``http_auth`` argument reaches the client."""
    _init_client_params.update(username="", aws4auth=None)
    OpenSearchDocumentStore._init_client(**_init_client_params)

    passed_kwargs = mocked_open_search_init.call_args.kwargs
    assert "http_auth" not in passed_kwargs
|
|
|
|
@pytest.mark.unit
def test_query(self, mocked_document_store):
    """``query`` calls the client's ``search`` with ``index``, ``body`` and ``headers`` keyword arguments."""
    mocked_document_store.query(query=self.query)

    _, search_kwargs = mocked_document_store.client.search.call_args
    for expected_key in ("index", "body", "headers"):
        assert expected_key in search_kwargs
|
|
|
|
@pytest.mark.unit
def test_query_return_embedding_false(self, mocked_document_store):
    """With ``return_embedding=False`` the search body excludes the ``embedding`` field from ``_source``."""
    mocked_document_store.return_embedding = False
    mocked_document_store.query(self.query)

    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    assert search_body["_source"] == {"excludes": ["embedding"]}
|
|
|
|
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store):
    """With ``return_embedding=True``, ``embedding`` is dropped from the excludes even if listed in ``excluded_meta_data``."""
    mocked_document_store.excluded_meta_data = ["foo", "embedding"]
    mocked_document_store.return_embedding = True
    mocked_document_store.query(self.query)

    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    # Only "foo" survives: "embedding" must have been removed from the final query
    assert search_body["_source"] == {"excludes": ["foo"]}
|
|
|
|
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store):
    """With ``return_embedding=False``, ``embedding`` is appended to the configured ``excluded_meta_data``."""
    mocked_document_store.excluded_meta_data = ["foo"]
    mocked_document_store.return_embedding = False
    mocked_document_store.query(self.query)

    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    # The body must combine `excluded_meta_data` with the embedding field
    assert search_body["_source"] == {"excludes": ["foo", "embedding"]}
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store):
    """``query_by_embedding`` raises ``DocumentStoreError`` when ``embedding_field`` is empty."""
    mocked_document_store.embedding_field = ""
    expected_error = pytest.raises(DocumentStoreError)
    with expected_error:
        mocked_document_store.query_by_embedding(self.query_emb)
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_raises_if_ivf_untrained(self, mocked_document_store):
    """``query_by_embedding`` raises on an ``ivf`` index that has a train size set but was not trained."""
    mocked_document_store.ivf_train_size = 10
    mocked_document_store.index_type = "ivf"

    untrained_message = "Index of type 'ivf' is not trained yet."
    with pytest.raises(DocumentStoreError, match=untrained_message):
        mocked_document_store.query_by_embedding(self.query_emb)
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_batch_if_ivf_untrained(self, mocked_document_store):
    """``query_by_embedding_batch`` raises on an ``ivf`` index that has a train size set but was not trained."""
    mocked_document_store.ivf_train_size = 10
    mocked_document_store.index_type = "ivf"

    untrained_message = "Index of type 'ivf' is not trained yet."
    with pytest.raises(DocumentStoreError, match=untrained_message):
        mocked_document_store.query_by_embedding_batch([self.query_emb])
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_filters(self, mocked_document_store):
    """For engines other than ``score_script``, filters end up under ``query.bool.filter`` of the search body."""
    assert mocked_document_store.knn_engine != "score_script"

    haystack_filters = {"type": "article", "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}}
    expected_clauses = [
        {"term": {"type": "article"}},
        {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}},
    ]

    mocked_document_store.query_by_embedding(self.query_emb, filters=haystack_filters)

    # Inspect what the client's `search` received
    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    filter_clause = search_body["query"]["bool"]["filter"]
    assert filter_clause["bool"]["must"] == expected_clauses
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_script_score_filters(self, mocked_document_store):
    """For the ``score_script`` engine, filters end up under ``query.script_score.query.bool.filter``."""
    mocked_document_store.knn_engine = "score_script"

    haystack_filters = {"type": "article", "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}}
    expected_clauses = [
        {"term": {"type": "article"}},
        {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}},
    ]

    mocked_document_store.query_by_embedding(self.query_emb, filters=haystack_filters)

    # Inspect what the client's `search` received
    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    filter_clause = search_body["query"]["script_score"]["query"]["bool"]["filter"]
    assert filter_clause["bool"]["must"] == expected_clauses
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_return_embedding_false(self, mocked_document_store):
    """With ``return_embedding=False`` the embedding query excludes the ``embedding`` field from ``_source``."""
    mocked_document_store.return_embedding = False
    mocked_document_store.query_by_embedding(self.query_emb)

    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    assert search_body["_source"] == {"excludes": ["embedding"]}
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_excluded_meta_data_return_embedding_true(self, mocked_document_store):
    """
    When `return_embedding` is True the embedding field must NOT be excluded, even though
    it was listed in `excluded_meta_data`
    """
    mocked_document_store.excluded_meta_data = ["foo", "embedding"]
    mocked_document_store.return_embedding = True
    mocked_document_store.query_by_embedding(self.query_emb)

    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    # Only "foo" survives: "embedding" must have been removed from the final query
    assert search_body["_source"] == {"excludes": ["foo"]}
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_excluded_meta_data_return_embedding_false(self, mocked_document_store):
    """
    When `return_embedding` is False the final query excludes the `embedding` field,
    even though it was not explicitly listed in `excluded_meta_data`
    """
    mocked_document_store.excluded_meta_data = ["foo"]
    mocked_document_store.return_embedding = False
    mocked_document_store.query_by_embedding(self.query_emb)

    search_body = mocked_document_store.client.search.call_args.kwargs["body"]
    # The body must combine `excluded_meta_data` with the embedding field
    assert search_body["_source"] == {"excludes": ["foo", "embedding"]}
|
|
|
|
@pytest.mark.unit
def test_query_by_embedding_batch_uses_msearch(self, mocked_document_store):
    """A batch of ten embedding queries is sent through the client's ``msearch`` with twenty body entries."""
    query_batch = [self.query_emb for _ in range(10)]
    mocked_document_store.query_by_embedding_batch(query_batch)

    msearch_body = mocked_document_store.client.msearch.call_args.kwargs["body"]
    # Each search contributes two entries: its headers and its request
    assert len(msearch_body) == 20
|
|
|
|
@pytest.mark.unit
def test__init_indices_with_alias(self, mocked_document_store, caplog):
    """``_init_indices`` logs that the index name is an alias when the client reports one."""
    mocked_document_store.client.indices.exists_alias.return_value = True
    search_engine_logger = "haystack.document_stores.search_engine"

    with caplog.at_level(logging.DEBUG, logger=search_engine_logger):
        mocked_document_store._init_indices(self.index_name, "labels", False, False)

    expected_log = f"Index name {self.index_name} is an alias."
    assert expected_log in caplog.text
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_wrong_mapping_raises(self, mocked_document_store, existing_index):
    """
    The method must raise when a field listed in `search_fields` is not mapped as text
    """
    # `age` is mapped as an integer, then requested as a full-text search field
    existing_index["mappings"]["properties"]["age"] = {"type": "integer"}
    mocked_document_store.search_fields = ["age"]

    expected_message = f"The index '{self.index_name}' needs the 'text' type for the search_field 'age' to run full text search, but got type 'integer'."
    with pytest.raises(DocumentStoreError, match=expected_message):
        mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_create_embedding_mapping_if_missing(self, mocked_document_store):
    """An embedding field without a mapping gets a ``knn_vector`` mapping pushed via ``put_mapping``."""
    unmapped_field = "doesnt_have_a_mapping"
    mocked_document_store.embedding_field = unmapped_field

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    # Inspect what the client's `put_mapping` received
    mapping_kwargs = mocked_document_store.client.indices.put_mapping.call_args.kwargs
    assert mapping_kwargs["index"] == self.index_name
    assert mapping_kwargs["body"]["properties"][unmapped_field]["type"] == "knn_vector"
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_create_search_field_mapping_if_missing(self, mocked_document_store):
    """A search field without a mapping gets a ``text`` mapping pushed via ``put_mapping``."""
    unmapped_field = "doesnt_have_a_mapping"
    mocked_document_store.search_fields = [unmapped_field]

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    # Inspect what the client's `put_mapping` received
    mapping_kwargs = mocked_document_store.client.indices.put_mapping.call_args.kwargs
    assert mapping_kwargs["index"] == self.index_name
    assert mapping_kwargs["body"]["properties"][unmapped_field]["type"] == "text"
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_with_bad_field_raises(self, mocked_document_store, existing_index):
    """The method must raise when ``embedding_field`` points at a field that is not mapped as ``knn_vector``."""
    # `age` is mapped as an integer, then used as the embedding field
    existing_index["mappings"]["properties"]["age"] = {"type": "integer"}
    mocked_document_store.embedding_field = "age"

    expected_message = f"The index '{self.index_name}' needs the 'knn_vector' type for the embedding_field 'age' to run vector search, but got type 'integer'."
    with pytest.raises(DocumentStoreError, match=expected_message):
        mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_but_no_method(self, mocked_document_store, existing_index):
    """
    The embedding field is properly mapped, but its mapping carries no `method` entry
    """
    embedding_mapping = existing_index["mappings"]["properties"]["embedding"]
    del embedding_mapping["method"]

    assert mocked_document_store.space_type == "innerproduct"

    expected_message = rf"Set `similarity` to one of '\['l2'\]' to properly use the embedding field 'embedding' of index '{self.index_name}'. Similarity 'dot_product' is not compatible with embedding field's space type 'l2', it requires 'innerproduct'."
    with pytest.raises(DocumentStoreError, match=expected_message):
        mocked_document_store._validate_and_adjust_document_index(self.index_name)

    # l2 is the default space_type, so validation must succeed once the store uses it
    mocked_document_store.space_type = "l2"
    mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_similarity(self, mocked_document_store):
    """Validation does not raise when the store's ``space_type`` is ``innerproduct``, as in the existing index."""
    setattr(mocked_document_store, "space_type", "innerproduct")
    mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_similarity_mismatch(self, mocked_document_store):
    """Validation raises when the store's ``space_type`` is ``cosinesimil`` but the index uses ``innerproduct``."""
    mocked_document_store.space_type = "cosinesimil"

    expected_message = rf"Set `similarity` to one of '\['dot_product'\]' to properly use the embedding field 'embedding' of index '{self.index_name}'. Similarity 'dot_product' is not compatible with embedding field's space type 'innerproduct', it requires 'cosinesimil'."
    with pytest.raises(DocumentStoreError, match=expected_message):
        mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_type_mismatch(self, mocked_document_store):
    """Validation raises when ``index_type="hnsw"`` meets an existing ``ef_construction`` of 512."""
    mocked_document_store.index_type = "hnsw"

    expected_message = f"The index_type 'hnsw' needs '80' as ef_construction value. Currently, the value for embedding field 'embedding' of index '{self.index_name}' is '512'."
    with pytest.raises(DocumentStoreError, match=expected_message):
        mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_change_knn_engine_to_faiss(self, mocked_document_store):
    """Validation raises when the store asks for ``faiss`` but the existing embedding field uses ``nmslib``."""
    mocked_document_store.knn_engine = "faiss"

    expected_message = f"Existing embedding field '{mocked_document_store.embedding_field}' of OpenSearch index '{self.index_name}' has knn_engine 'nmslib', but knn_engine was set to 'faiss'."
    with pytest.raises(DocumentStoreError, match=expected_message):
        mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_change_knn_engine_to_score_script(self, mocked_document_store):
    """Validation does not raise with ``score_script`` and ``cosinesimil`` against the existing nmslib index."""
    mocked_document_store.space_type = "cosinesimil"
    mocked_document_store.knn_engine = "score_script"

    mocked_document_store._validate_and_adjust_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_adjusts_ef_search_for_hnsw_when_default(
    self, mocked_document_store, existing_index
):
    """
    `ef_search` gets adjusted when `knn.algo_param` is absent from the index settings
    """
    method_params = existing_index["mappings"]["properties"]["embedding"]["method"]["parameters"]
    method_params.update(ef_construction=80, m=64)
    mocked_document_store.index_type = "hnsw"

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    # The settings update must carry the adjusted value
    settings_kwargs = mocked_document_store.client.indices.put_settings.call_args.kwargs
    assert settings_kwargs["body"] == {"knn.algo_param.ef_search": 20}
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_adjusts_ef_search_for_hnsw_when_set_different(
    self, mocked_document_store, existing_index
):
    """
    `ef_search` gets adjusted when `knn.algo_param` holds a value that differs from the expected one
    """
    method_params = existing_index["mappings"]["properties"]["embedding"]["method"]["parameters"]
    method_params.update(ef_construction=80, m=64)
    existing_index["settings"]["index"]["knn.algo_param"] = {"ef_search": 999}
    mocked_document_store.index_type = "hnsw"

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    # The settings update must carry the adjusted value
    settings_kwargs = mocked_document_store.client.indices.put_settings.call_args.kwargs
    assert settings_kwargs["body"] == {"knn.algo_param.ef_search": 20}
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_ignores_index_setting_ef_search_for_faiss(
    self, mocked_document_store, existing_index
):
    """With a faiss engine, an ``ef_search`` value in the index settings does not trigger ``put_settings``."""
    mocked_document_store.knn_engine = "faiss"

    embedding_method = existing_index["mappings"]["properties"]["embedding"]["method"]
    embedding_method["engine"] = "faiss"
    embedding_method["parameters"].update(ef_construction=512, m=16)
    existing_index["settings"]["index"]["knn.algo_param"] = {"ef_search": 999}

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    put_settings = mocked_document_store.client.indices.put_settings
    put_settings.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_ignores_parameter_ef_search_for_nmslib(
    self, mocked_document_store, existing_index
):
    """With nmslib, an ``ef_search`` entry among the method parameters does not trigger ``put_settings``."""
    method_params = existing_index["mappings"]["properties"]["embedding"]["method"]["parameters"]
    method_params.update(ef_construction=512, m=16, ef_search=999)
    existing_index["settings"]["index"]["knn.algo_param"] = {"ef_search": 512}

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    put_settings = mocked_document_store.client.indices.put_settings
    put_settings.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_does_not_adjust_ef_search_for_hnsw_when_set_correct(
    self, mocked_document_store, existing_index
):
    """
    Params that already hold the expected values must be left alone.
    """
    method_params = existing_index["mappings"]["properties"]["embedding"]["method"]["parameters"]
    method_params.update(ef_construction=80, m=64)
    existing_index["settings"]["index"]["knn.algo_param"] = {"ef_search": 20}
    mocked_document_store.index_type = "hnsw"

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    put_settings = mocked_document_store.client.indices.put_settings
    put_settings.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_adjusts_ef_search_for_flat_when_set_different(
    self, mocked_document_store, existing_index
):
    """
    `ef_search` gets adjusted when `knn.algo_param` holds a value that differs from the expected one
    """
    existing_index["settings"]["index"]["knn.algo_param"] = {"ef_search": 999}
    mocked_document_store.index_type = "flat"

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    # The settings update must carry the adjusted value
    settings_kwargs = mocked_document_store.client.indices.put_settings.call_args.kwargs
    assert settings_kwargs["body"] == {"knn.algo_param.ef_search": 512}
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_does_not_adjust_ef_search_for_flat_when_default(
    self, mocked_document_store
):
    """
    The index settings are left as provided by the fixture (no `knn.algo_param`
    is set here); a "flat" index must then need no settings update.
    """
    mocked_document_store.index_type = "flat"

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    put_settings = mocked_document_store.client.indices.put_settings
    put_settings.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_does_not_adjust_ef_search_for_flat_when_set_correct(
    self, mocked_document_store, existing_index
):
    """
    A "flat" index whose `knn.algo_param` already holds `ef_search` 512
    must be validated without any settings update.
    """
    mocked_document_store.index_type = "flat"
    existing_index["settings"]["index"]["knn.algo_param"] = {"ef_search": 512}

    mocked_document_store._validate_and_adjust_document_index(self.index_name)

    put_settings = mocked_document_store.client.indices.put_settings
    put_settings.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__validate_and_adjust_document_index_with_non_existing_index(self, mocked_document_store, caplog):
    """
    When the client returns no index data at all, validation must log
    that the index doesn't exist.
    """
    mocked_document_store.client.indices.get.return_value = {}

    with caplog.at_level(logging.WARNING):
        mocked_document_store._validate_and_adjust_document_index(self.index_name)

    expected_message = f"The index '{self.index_name}' doesn't exist. "
    assert expected_message in caplog.text
|
|
|
|
@pytest.mark.unit
@pytest.mark.parametrize("create_index", [True, False])
@pytest.mark.parametrize("recreate_index", [True, False])
def test__init_indices_always_calls_validation_if_no_custom_mapping(
    self, mocked_document_store, create_index, recreate_index
):
    """
    For every combination of `create_index` and `recreate_index`,
    `_init_indices` must run the index validation exactly once.
    """
    validation_spy = MagicMock()
    mocked_document_store._validate_and_adjust_document_index = validation_spy

    mocked_document_store._init_indices(self.index_name, "label_index", create_index, recreate_index)

    mocked_document_store._validate_and_adjust_document_index.assert_called_once()
|
|
|
|
@pytest.mark.unit
@pytest.mark.parametrize("create_index", [True, False])
@pytest.mark.parametrize("recreate_index", [True, False])
def test__init_indices_never_calls_validation_if_custom_mapping(
    self, mocked_document_store, create_index, recreate_index, caplog
):
    """
    With a custom mapping set, `_init_indices` must skip the index validation
    and log that it did so, for every flag combination.
    """
    embedding_properties = {"embedding": {"type": "dense_vector", "dims": 768}}
    mocked_document_store.custom_mapping = {"mappings": {"properties": embedding_properties}}
    validation_spy = MagicMock()
    mocked_document_store._validate_and_adjust_document_index = validation_spy

    with caplog.at_level(logging.WARNING):
        mocked_document_store._init_indices(self.index_name, "label_index", create_index, recreate_index)

    assert "Skipping index validation" in caplog.text
    mocked_document_store._validate_and_adjust_document_index.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__init_indices_creates_index_if_not_exists(self, mocked_document_store):
    """
    If the client reports that no index exists and `create_index` is True,
    an index creation request must be issued.
    """
    indices_api = mocked_document_store.client.indices
    indices_api.exists.return_value = False

    mocked_document_store._init_indices(self.index_name, "label_index", create_index=True, recreate_index=False)

    indices_api.create.assert_called()
|
|
|
|
@pytest.mark.unit
def test__init_indices_does_not_create_index_if_exists(self, mocked_document_store):
    """
    With the fixture's client left as is, `_init_indices` with `create_index=True`
    must not issue any index creation request.
    """
    mocked_document_store._init_indices(self.index_name, "label_index", create_index=True, recreate_index=False)

    create_call = mocked_document_store.client.indices.create
    create_call.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__init_indices_does_not_create_index_if_not_create_index(self, mocked_document_store):
    """
    Even when the client reports that no index exists, `create_index=False`
    must prevent any index creation request.
    """
    indices_api = mocked_document_store.client.indices
    indices_api.exists.return_value = False

    mocked_document_store._init_indices(self.index_name, "label_index", create_index=False, recreate_index=False)

    indices_api.create.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__init_indices_creates_index_if_exists_and_recreate_index(self, mocked_document_store):
    """
    With `recreate_index=True`, existing indices must be deleted and then created again.
    """
    # Answers for the existence checks made while deleting: the doc index and the
    # label index are checked, plus one ivf model check for each of them.
    answers_during_delete = [True, False, True, False]
    # Answers for the existence checks made while creating: doc index, then label index.
    answers_during_create = [False, False]
    indices_api = mocked_document_store.client.indices
    indices_api.exists.side_effect = answers_during_delete + answers_during_create

    mocked_document_store._init_indices(self.index_name, "label_index", create_index=True, recreate_index=True)

    indices_api.delete.assert_called()
    indices_api.create.assert_called()
|
|
|
|
@pytest.mark.unit
def test__create_document_index_no_index_custom_mapping(self, mocked_document_store):
    """
    A custom mapping must be sent unchanged as the body of the index creation
    request, while the store keeps its knn engine and space type.
    """
    mocked_document_store.custom_mapping = {"mappings": {"properties": {"a_number": {"type": "integer"}}}}

    mocked_document_store._create_document_index(self.index_name)

    sent_kwargs = mocked_document_store.client.indices.create.call_args[1]
    # built from a fresh literal so the comparison does not share state with the store
    expected_body = {"mappings": {"properties": {"a_number": {"type": "integer"}}}}
    assert sent_kwargs["body"] == expected_body
    assert mocked_document_store.knn_engine == "nmslib"
    assert mocked_document_store.space_type == "innerproduct"
|
|
|
|
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping(self, mocked_document_store):
    """
    Without a custom mapping, the creation request must carry the mapping and
    settings spelled out below.
    """
    mocked_document_store._create_document_index(self.index_name)

    sent_kwargs = mocked_document_store.client.indices.create.call_args[1]

    expected_embedding = {
        "dimension": 768,
        "method": {
            "engine": "nmslib",
            "name": "hnsw",
            "parameters": {"ef_construction": 512, "m": 16},
            "space_type": "innerproduct",
        },
        "type": "knn_vector",
    }
    expected_mappings = {
        "dynamic_templates": [
            {"strings": {"mapping": {"type": "keyword"}, "match_mapping_type": "string", "path_match": "*"}}
        ],
        "properties": {
            "content": {"type": "text"},
            "embedding": expected_embedding,
            "name": {"type": "keyword"},
        },
    }
    expected_settings = {"analysis": {"analyzer": {"default": {"type": "standard"}}}, "index": {"knn": True}}

    assert sent_kwargs["body"] == {"mappings": expected_mappings, "settings": expected_settings}
    assert mocked_document_store.knn_engine == "nmslib"
    assert mocked_document_store.space_type == "innerproduct"
|
|
|
|
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping_with_synonyms(self, mocked_document_store):
    """
    With synonyms and an extra search field configured, the creation request must
    use the "synonym" analyzer for the text fields and define the synonym filter.
    """
    mocked_document_store.search_fields = ["occupation"]
    mocked_document_store.synonyms = ["foo"]

    mocked_document_store._create_document_index(self.index_name)

    sent_kwargs = mocked_document_store.client.indices.create.call_args[1]

    expected_embedding = {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
            "space_type": "innerproduct",
            "name": "hnsw",
            "engine": "nmslib",
            "parameters": {"ef_construction": 512, "m": 16},
        },
    }
    expected_mappings = {
        "properties": {
            "name": {"type": "keyword"},
            "content": {"type": "text", "analyzer": "synonym"},
            "occupation": {"type": "text", "analyzer": "synonym"},
            "embedding": expected_embedding,
        },
        "dynamic_templates": [
            {"strings": {"path_match": "*", "match_mapping_type": "string", "mapping": {"type": "keyword"}}}
        ],
    }
    expected_analysis = {
        "analyzer": {
            "default": {"type": "standard"},
            "synonym": {"tokenizer": "whitespace", "filter": ["lowercase", "synonym"]},
        },
        "filter": {"synonym": {"type": "synonym", "synonyms": ["foo"]}},
    }
    expected_settings = {"analysis": expected_analysis, "index": {"knn": True}}

    assert sent_kwargs["body"] == {"mappings": expected_mappings, "settings": expected_settings}
    assert mocked_document_store.knn_engine == "nmslib"
    assert mocked_document_store.space_type == "innerproduct"
|
|
|
|
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping_with_embedding_field(self, mocked_document_store):
    """
    A custom embedding field name combined with the "hnsw" index type must show up
    in the creation request as the vector property, along with the `ef_search` setting.
    """
    mocked_document_store.embedding_field = "vec"
    mocked_document_store.index_type = "hnsw"

    mocked_document_store._create_document_index(self.index_name)

    sent_kwargs = mocked_document_store.client.indices.create.call_args[1]

    expected_vector_field = {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
            "space_type": "innerproduct",
            "name": "hnsw",
            "engine": "nmslib",
            "parameters": {"ef_construction": 80, "m": 64},
        },
    }
    expected_mappings = {
        "properties": {
            "name": {"type": "keyword"},
            "content": {"type": "text"},
            "vec": expected_vector_field,
        },
        "dynamic_templates": [
            {"strings": {"path_match": "*", "match_mapping_type": "string", "mapping": {"type": "keyword"}}}
        ],
    }
    expected_settings = {
        "analysis": {"analyzer": {"default": {"type": "standard"}}},
        "index": {"knn": True, "knn.algo_param.ef_search": 20},
    }

    assert sent_kwargs["body"] == {"mappings": expected_mappings, "settings": expected_settings}
    assert mocked_document_store.knn_engine == "nmslib"
    assert mocked_document_store.space_type == "innerproduct"
|
|
|
|
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping_faiss(self, mocked_document_store):
    """
    With the knn engine switched to "faiss", the creation request must name
    that engine in the embedding method.
    """
    mocked_document_store.knn_engine = "faiss"

    mocked_document_store._create_document_index(self.index_name)

    sent_kwargs = mocked_document_store.client.indices.create.call_args[1]

    expected_embedding = {
        "dimension": 768,
        "method": {
            "engine": "faiss",
            "name": "hnsw",
            "parameters": {"ef_construction": 512, "m": 16},
            "space_type": "innerproduct",
        },
        "type": "knn_vector",
    }
    expected_mappings = {
        "dynamic_templates": [
            {"strings": {"mapping": {"type": "keyword"}, "match_mapping_type": "string", "path_match": "*"}}
        ],
        "properties": {
            "content": {"type": "text"},
            "embedding": expected_embedding,
            "name": {"type": "keyword"},
        },
    }
    expected_settings = {"analysis": {"analyzer": {"default": {"type": "standard"}}}, "index": {"knn": True}}

    assert sent_kwargs["body"] == {"mappings": expected_mappings, "settings": expected_settings}
|
|
|
|
@pytest.mark.unit
def test__create_document_index_client_failure(self, mocked_document_store):
    """
    A `RequestError` raised by the client while creating the index must
    propagate out of `_create_document_index`.
    """
    indices_api = mocked_document_store.client.indices
    indices_api.exists.return_value = False
    indices_api.create.side_effect = RequestError

    with pytest.raises(RequestError):
        mocked_document_store._create_document_index(self.index_name)
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_flat(self, mocked_document_store):
    """
    Check the embedding field mapping produced for the "flat" index type.
    """
    mocked_document_store.index_type = "flat"

    mapping = mocked_document_store._get_embedding_field_mapping()

    expected_method = {
        "space_type": "innerproduct",
        "name": "hnsw",
        "engine": "nmslib",
        "parameters": {"ef_construction": 512, "m": 16},
    }
    assert mapping == {"type": "knn_vector", "dimension": 768, "method": expected_method}
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_default_hnsw(self, mocked_document_store):
    """
    Check the embedding field mapping produced for the "hnsw" index type
    when no knn parameters are set by the test.
    """
    mocked_document_store.index_type = "hnsw"

    mapping = mocked_document_store._get_embedding_field_mapping()

    expected_method = {
        "space_type": "innerproduct",
        "name": "hnsw",
        "engine": "nmslib",
        "parameters": {"ef_construction": 80, "m": 64},
    }
    assert mapping == {"type": "knn_vector", "dimension": 768, "method": expected_method}
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_default_hnsw_faiss(self, mocked_document_store):
    """
    Check the embedding field mapping for "hnsw" with the "faiss" engine:
    `ef_search` is expected among the method parameters.
    """
    mocked_document_store.index_type = "hnsw"
    mocked_document_store.knn_engine = "faiss"

    mapping = mocked_document_store._get_embedding_field_mapping()

    expected_method = {
        "space_type": "innerproduct",
        "name": "hnsw",
        "engine": "faiss",
        "parameters": {"ef_construction": 80, "m": 64, "ef_search": 20},
    }
    assert mapping == {"type": "knn_vector", "dimension": 768, "method": expected_method}
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_custom_hnsw(self, mocked_document_store):
    """
    Custom `knn_parameters` must be reflected in the "hnsw" embedding field mapping.
    """
    mocked_document_store.index_type = "hnsw"
    mocked_document_store.knn_parameters = {"ef_construction": 1, "m": 2}

    mapping = mocked_document_store._get_embedding_field_mapping()

    expected_method = {
        "space_type": "innerproduct",
        "engine": "nmslib",
        "name": "hnsw",
        "parameters": {"ef_construction": 1, "m": 2},
    }
    assert mapping == {"type": "knn_vector", "dimension": 768, "method": expected_method}
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_custom_hnsw_faiss(self, mocked_document_store):
    """
    Custom `knn_parameters`, including `ef_search`, must be reflected in the
    "hnsw" embedding field mapping when the engine is "faiss".
    """
    mocked_document_store.index_type = "hnsw"
    mocked_document_store.knn_engine = "faiss"
    mocked_document_store.knn_parameters = {"ef_construction": 1, "m": 2, "ef_search": 3}

    mapping = mocked_document_store._get_embedding_field_mapping()

    expected_method = {
        "space_type": "innerproduct",
        "engine": "faiss",
        "name": "hnsw",
        "parameters": {"ef_construction": 1, "m": 2, "ef_search": 3},
    }
    assert mapping == {"type": "knn_vector", "dimension": 768, "method": expected_method}
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_ivf(self, mocked_document_store):
    """
    Check the "ivf" embedding field mapping in two situations: while the client
    reports nothing as existing, and after it reports a stored model.
    """
    mocked_document_store.index_type = "ivf"
    mocked_document_store.knn_engine = "faiss"
    indices_api = mocked_document_store.client.indices
    indices_api.exists.return_value = False

    # Untrained case: the mapping is expected to carry only type and dimension
    untrained_mapping = mocked_document_store._get_embedding_field_mapping()
    assert untrained_mapping == {"type": "knn_vector", "dimension": 768}

    # Trained case: the client now answers with a stored model document
    indices_api.exists.return_value = True
    stored_model = {
        "model_blob": "<SOME MODEL BLOB>",
        "engine": "faiss",
        "space_type": "innerproduct",
        "description": "index_type:ivf nlist:4 nprobes:1",
        "model_id": f"{mocked_document_store.index}-ivf",
        "state": "created",
        "error": "",
        "dimension": 768,
        "timestamp": "2023-01-25T16:04:21.284398Z",
    }
    model_hit = {
        "_index": ".opensearch-knn-models",
        "_type": "_doc",
        "_id": "document-ivf",
        "_score": 1.0,
        "_source": stored_model,
    }
    mocked_document_store.client.transport.perform_request.return_value = {
        "took": 4,
        "timed_out": False,
        "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0},
        "hits": {"total": {"value": 1, "relation": "eq"}, "max_score": 1.0, "hits": [model_hit]},
    }

    trained_mapping = mocked_document_store._get_embedding_field_mapping()
    assert trained_mapping == {"type": "knn_vector", "model_id": f"{mocked_document_store.index}-ivf"}
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_ivfpq(self, mocked_document_store):
    """
    Check the "ivf_pq" embedding field mapping in two situations: while the client
    reports nothing as existing, and after it reports a stored model.
    """
    mocked_document_store.index_type = "ivf_pq"
    mocked_document_store.knn_engine = "faiss"
    indices_api = mocked_document_store.client.indices
    indices_api.exists.return_value = False

    # Untrained case: the mapping is expected to carry only type and dimension
    untrained_mapping = mocked_document_store._get_embedding_field_mapping()
    assert untrained_mapping == {"type": "knn_vector", "dimension": 768}

    # Trained case: the client now answers with a stored model document
    indices_api.exists.return_value = True
    stored_model = {
        "model_blob": "<SOME MODEL BLOB>",
        "engine": "faiss",
        "space_type": "innerproduct",
        "description": "index_type:ivf_pq nlist:4 nprobes:1 m:1 code_size:8",
        "model_id": f"{mocked_document_store.index}-ivf",
        "state": "created",
        "error": "",
        "dimension": 768,
        "timestamp": "2023-01-25T16:04:21.284398Z",
    }
    model_hit = {
        "_index": ".opensearch-knn-models",
        "_type": "_doc",
        "_id": "document-ivf",
        "_score": 1.0,
        "_source": stored_model,
    }
    mocked_document_store.client.transport.perform_request.return_value = {
        "took": 4,
        "timed_out": False,
        "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0},
        "hits": {"total": {"value": 1, "relation": "eq"}, "max_score": 1.0, "hits": [model_hit]},
    }

    trained_mapping = mocked_document_store._get_embedding_field_mapping()
    assert trained_mapping == {"type": "knn_vector", "model_id": f"{mocked_document_store.index}-ivf"}
|
|
|
|
@pytest.mark.unit
def test__get_embedding_field_mapping_wrong(self, mocked_document_store, caplog):
    """
    An unknown index type must be reported in the log, and the returned mapping
    must contain a method without parameters.
    """
    mocked_document_store.index_type = "foo"

    with caplog.at_level(logging.ERROR, logger="haystack.document_stores.opensearch"):
        retval = mocked_document_store._get_embedding_field_mapping()

    assert "Set index_type to either 'flat', 'hnsw', 'ivf', or 'ivf_pq'" in caplog.text
    expected_method = {"space_type": "innerproduct", "name": "hnsw", "engine": "nmslib"}
    assert retval == {"type": "knn_vector", "dimension": 768, "method": expected_method}
|
|
|
|
@pytest.mark.unit
def test__create_label_index_already_exists(self, mocked_document_store):
    """
    When the client reports that the indices exist, `_init_indices` must not
    issue any creation request.
    """
    indices_api = mocked_document_store.client.indices
    indices_api.exists.return_value = True

    mocked_document_store._init_indices("doc_index", "label_index", True, False)

    indices_api.create.assert_not_called()
|
|
|
|
@pytest.mark.unit
def test__create_label_index_client_error(self, mocked_document_store):
    """
    A `RequestError` raised by the client while creating the label index must
    propagate out of `_create_label_index`.
    """
    indices_api = mocked_document_store.client.indices
    indices_api.exists.return_value = False
    indices_api.create.side_effect = RequestError

    with pytest.raises(RequestError):
        mocked_document_store._create_label_index("foo")
|
|
|
|
@pytest.mark.unit
def test__get_vector_similarity_query_support_true(self, mocked_document_store):
    """
    When the knn engine is not "score_script", the similarity query must be a
    `knn` clause on the configured embedding field.
    """
    mocked_document_store.embedding_field = "FooField"
    assert mocked_document_store.knn_engine != "score_script"

    query = mocked_document_store._get_vector_similarity_query(self.query_emb, 3)

    expected_knn_clause = {"knn": {"FooField": {"vector": self.query_emb.tolist(), "k": 3}}}
    assert query == {"bool": {"must": [expected_knn_clause]}}
|
|
|
|
@pytest.mark.unit
def test__get_vector_similarity_query_support_false(self, mocked_document_store):
    """
    With the "score_script" knn engine, the similarity query must be a
    `script_score` query using the `knn_score` script.
    """
    mocked_document_store.embedding_field = "FooField"
    mocked_document_store.knn_engine = "score_script"
    mocked_document_store.space_type = "innerproduct"

    query = mocked_document_store._get_vector_similarity_query(self.query_emb, 3)

    expected_script = {
        "source": "knn_score",
        "lang": "knn",
        "params": {
            "field": "FooField",
            "query_value": self.query_emb.tolist(),
            "space_type": "innerproduct",
        },
    }
    assert query == {"script_score": {"query": {"match_all": {}}, "script": expected_script}}
|
|
|
|
@pytest.mark.unit
def test__get_raw_similarity_score_dot(self, mocked_document_store):
    """
    Check the raw scores computed for two sample scores with "dot_product" similarity.
    """
    mocked_document_store.similarity = "dot_product"

    # (score passed in, raw score expected back)
    samples = ((2, 1), (-2, 1.5))
    for score, expected_raw in samples:
        assert mocked_document_store._get_raw_similarity_score(score) == expected_raw
|
|
|
|
@pytest.mark.unit
def test__get_raw_similarity_score_l2(self, mocked_document_store):
    """
    Check the raw score computed for a score of 1 with "l2" similarity.
    """
    mocked_document_store.similarity = "l2"

    raw_score = mocked_document_store._get_raw_similarity_score(1)

    assert raw_score == 0
|
|
|
|
@pytest.mark.unit
def test__get_raw_similarity_score_cosine(self, mocked_document_store):
    """
    Check the raw score for a score of 1 with the "cosinesimil" space type,
    first with the store's current knn engine, then with "score_script".
    """
    mocked_document_store.space_type = "cosinesimil"

    # first pass: whatever engine the store has, as long as it is not "score_script"
    assert mocked_document_store.knn_engine != "score_script"
    raw_with_engine = mocked_document_store._get_raw_similarity_score(1)
    assert raw_with_engine == 1

    # second pass: same score, scripted scoring
    mocked_document_store.knn_engine = "score_script"
    raw_with_script = mocked_document_store._get_raw_similarity_score(1)
    assert raw_with_script == 0
|
|
|
|
@pytest.mark.unit
def test_clone_embedding_field_duplicate_mapping(self, mocked_document_store):
    """
    Cloning into a field named "embedding" must fail with an error stating
    that the field already exists with a mapping.
    """
    mocked_document_store.index = self.index_name

    expected_error = "embedding already exists with mapping"
    with pytest.raises(Exception, match=expected_error):
        mocked_document_store.clone_embedding_field("embedding", "cosine")
|
|
|
|
@pytest.mark.unit
def test_clone_embedding_field_update_mapping(self, mocked_document_store, monkeypatch):
    """
    Cloning the embedding field into a new field must send a mapping update
    that defines the new field with the "cosinesimil" space type.
    """
    mocked_document_store.index = self.index_name

    # Replace document retrieval and tqdm so that only the mapping update is exercised
    mocked_document_store._get_all_documents_in_index = MagicMock(return_value=[])
    monkeypatch.setattr(tqdm, "__new__", MagicMock())

    mocked_document_store.clone_embedding_field("a_field", "cosine")

    sent_kwargs = mocked_document_store.client.indices.put_mapping.call_args[1]
    new_field_mapping = sent_kwargs["body"]["properties"]["a_field"]
    expected_method = {
        "space_type": "cosinesimil",
        "name": "hnsw",
        "engine": "nmslib",
        "parameters": {"ef_construction": 512, "m": 16},
    }
    assert new_field_mapping == {"type": "knn_vector", "dimension": 768, "method": expected_method}
|
|
|
|
@pytest.mark.unit
def test_bulk_write_retries_for_always_failing_insert_is_canceled(self, mocked_document_store, monkeypatch, caplog):
    """
    When every bulk request fails with a 429 error, `_bulk` must give up with a
    `DocumentStoreError` after the allowed number of tries.
    """
    docs_to_write = [
        {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
        for i in range(1000)
    ]
    give_up_message = "Last try of bulk indexing documents failed."

    with patch("haystack.document_stores.opensearch.bulk") as mocked_bulk:
        mocked_bulk.side_effect = opensearchpy.TransportError(429, "Too many requests")

        with pytest.raises(DocumentStoreError, match=give_up_message):
            mocked_document_store._bulk(documents=docs_to_write, _timeout=0, _remaining_tries=3)

        # the retries go depth first, so the first failing branch cancels the whole request
        assert mocked_bulk.call_count == 3

        logged = caplog.text
        assert "Too Many Requests" in logged
        assert " Splitting the number of documents into two chunks with the same size" in logged
|
|
|
|
@pytest.mark.unit
def test_bulk_write_retries_with_backoff_with_smaller_batch_size_on_too_many_requests(
    self, mocked_document_store, monkeypatch
):
    """
    When the first bulk requests fail with a 429 error and the later ones succeed,
    `_bulk` must complete after five bulk calls in total.
    """
    docs_to_write = [
        {"meta": {"name": f"name_{i}"}, "content": f"text_{i}", "embedding": np.random.rand(768).astype(np.float32)}
        for i in range(1000)
    ]

    with patch("haystack.document_stores.opensearch.bulk") as mocked_bulk:
        # The documents get split and retried such that
        # 1k => 500 (failed) + 500 (successful) => 250 (successful) + 250 (successful),
        # which adds up to 5 calls: two rejected ones followed by three accepted ones.
        rejected = [opensearchpy.TransportError(429, "Too many requests") for _ in range(2)]
        accepted = [None, None, None]
        mocked_bulk.side_effect = rejected + accepted

        mocked_document_store._bulk(documents=docs_to_write, _timeout=0, _remaining_tries=3)

        assert mocked_bulk.call_count == 5
|
|
|
|
@pytest.mark.unit
def test_get_document_by_id_return_embedding_false(self, mocked_document_store):
    """
    With `return_embedding` disabled, the search request must exclude the
    embedding field from the returned source.
    """
    mocked_document_store.return_embedding = False

    mocked_document_store.get_document_by_id("123")

    # the request body has to reflect the `return_embedding` value
    sent_kwargs = mocked_document_store.client.search.call_args[1]
    source_filter = sent_kwargs["body"]["_source"]
    assert source_filter == {"excludes": ["embedding"]}
|
|
|
|
@pytest.mark.unit
def test_get_document_by_id_excluded_meta_data_has_no_influence(self, mocked_document_store):
    """
    Setting `excluded_meta_data` must not change the source filter sent by
    `get_document_by_id`: only the embedding field is excluded.
    """
    mocked_document_store.excluded_meta_data = ["foo"]
    mocked_document_store.return_embedding = False

    mocked_document_store.get_document_by_id("123")

    # the request body must be unaffected by the `excluded_meta_data` value
    sent_kwargs = mocked_document_store.client.search.call_args[1]
    source_filter = sent_kwargs["body"]["_source"]
    assert source_filter == {"excludes": ["embedding"]}
|
|
|
|
@pytest.mark.unit
def test_write_documents_req_for_each_batch(self, mocked_document_store, documents):
    """
    With a batch size of 2, writing the `documents` fixture must result in
    five bulk requests.
    """
    mocked_document_store.batch_size = 2

    bulk_target = "haystack.document_stores.opensearch.bulk"
    with patch(bulk_target) as mocked_bulk:
        mocked_document_store.write_documents(documents)
        assert mocked_bulk.call_count == 5
|