2022-10-31 15:30:14 +01:00
import os
2022-07-28 10:04:49 +02:00
import logging
2022-09-13 14:30:30 +01:00
from unittest . mock import MagicMock , patch
2022-06-08 16:37:23 +02:00
import pytest
2022-07-28 10:04:49 +02:00
import numpy as np
2022-09-13 14:30:30 +01:00
import opensearchpy
2022-07-28 10:04:49 +02:00
from haystack . document_stores . opensearch import (
OpenSearch ,
OpenSearchDocumentStore ,
RequestsHttpConnection ,
Urllib3HttpConnection ,
RequestError ,
tqdm ,
)
from haystack . errors import DocumentStoreError
2023-02-17 19:38:03 +01:00
from haystack . testing import DocumentStoreBaseTestAbstract
2022-06-08 16:37:23 +02:00
2022-10-31 15:30:14 +01:00
from . test_search_engine import SearchEngineDocumentStoreTestAbstract
class TestOpenSearchDocumentStore ( DocumentStoreBaseTestAbstract , SearchEngineDocumentStoreTestAbstract ) :
2022-07-28 10:04:49 +02:00
# Constants
2022-08-22 13:19:45 +02:00
query_emb = np . random . random_sample ( size = ( 2 , 2 ) )
2022-10-31 15:30:14 +01:00
index_name = __name__
2022-07-28 10:04:49 +02:00
# Fixtures
@pytest.fixture
def ds ( self ) :
"""
2023-04-18 15:40:17 +02:00
This fixture provides a working document store and takes care of keeping clean the
OS cluster used in the tests .
2022-07-28 10:04:49 +02:00
"""
2022-10-31 15:30:14 +01:00
labels_index_name = f " { self . index_name } _labels "
ds = OpenSearchDocumentStore (
index = self . index_name ,
label_index = labels_index_name ,
host = os . environ . get ( " OPENSEARCH_HOST " , " localhost " ) ,
create_index = True ,
2023-04-18 15:40:17 +02:00
recreate_index = True ,
2022-10-31 15:30:14 +01:00
)
2023-04-18 15:40:17 +02:00
2022-07-28 10:04:49 +02:00
yield ds
@pytest.fixture
2022-12-27 15:24:31 +01:00
def mocked_document_store ( self , existing_index ) :
2022-07-28 10:04:49 +02:00
"""
The fixture provides an instance of a slightly customized
OpenSearchDocumentStore equipped with a mocked client
"""
class DSMock ( OpenSearchDocumentStore ) :
# We mock a subclass to avoid messing up the actual class object
pass
2022-12-27 15:24:31 +01:00
opensearch_mock = MagicMock ( )
opensearch_mock . indices . exists . return_value = True
opensearch_mock . indices . get . return_value = { self . index_name : existing_index }
2023-07-13 14:50:43 +02:00
opensearch_mock . info . return_value = { " version " : { " number " : " 1.3.5 " } }
2022-07-28 10:04:49 +02:00
DSMock . _init_client = MagicMock ( )
2022-12-27 15:24:31 +01:00
DSMock . _init_client . configure_mock ( return_value = opensearch_mock )
dsMock = DSMock ( )
return dsMock
2022-07-28 10:04:49 +02:00
@pytest.fixture
def mocked_open_search_init ( self , monkeypatch ) :
mocked_init = MagicMock ( return_value = None )
monkeypatch . setattr ( OpenSearch , " __init__ " , mocked_init )
return mocked_init
@pytest.fixture
def _init_client_params ( self ) :
"""
The fixture provides the required arguments to call OpenSearchDocumentStore . _init_client
"""
return {
" host " : " localhost " ,
" port " : 9999 ,
" username " : " user " ,
" password " : " pass " ,
" aws4auth " : None ,
" scheme " : " http " ,
" ca_certs " : " ca_certs " ,
" verify_certs " : True ,
" timeout " : 42 ,
" use_system_proxy " : True ,
}
@pytest.fixture
2022-12-27 15:24:31 +01:00
def existing_index ( self ) :
2022-07-28 10:04:49 +02:00
return {
" aliases " : { } ,
" mappings " : {
" properties " : {
2022-12-27 15:24:31 +01:00
" content " : { " type " : " text " } ,
" embedding " : {
2022-07-28 10:04:49 +02:00
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" engine " : " nmslib " ,
" space_type " : " innerproduct " ,
" name " : " hnsw " ,
" parameters " : { " ef_construction " : 512 , " m " : 16 } ,
} ,
} ,
}
} ,
" settings " : {
" index " : {
" creation_date " : " 1658337984559 " ,
" number_of_shards " : " 1 " ,
" number_of_replicas " : " 1 " ,
" uuid " : " jU5KPBtXQHOaIn2Cm2d4jg " ,
" version " : { " created " : " 135238227 " } ,
2022-12-27 15:24:31 +01:00
" provided_name " : " existing_index " ,
2022-07-28 10:04:49 +02:00
}
} ,
}
# Integration tests
@pytest.mark.integration
def test___init__ ( self ) :
2022-12-27 15:24:31 +01:00
OpenSearchDocumentStore ( index = " nmslib_index " , create_index = True )
2022-07-28 10:04:49 +02:00
2022-08-24 16:43:48 +02:00
@pytest.mark.integration
2023-02-17 10:28:36 +01:00
@pytest.mark.parametrize ( " index_type " , [ " flat " , " hnsw " , " ivf " , " ivf_pq " ] )
def test___init___faiss ( self , index_type ) :
OpenSearchDocumentStore (
index = f " faiss_index_ { index_type } " , recreate_index = True , knn_engine = " faiss " , index_type = index_type
)
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
@pytest.mark.integration
def test___init___score_script ( self ) :
OpenSearchDocumentStore ( index = " score_script_index " , create_index = True , knn_engine = " score_script " )
2022-07-28 10:04:49 +02:00
@pytest.mark.integration
def test_recreate_index ( self , ds , documents , labels ) :
ds . write_documents ( documents )
ds . write_labels ( labels )
# Create another document store on top of the previous one
2022-10-31 15:30:14 +01:00
ds = OpenSearchDocumentStore ( index = ds . index , label_index = ds . label_index , recreate_index = True )
2022-07-28 10:04:49 +02:00
assert len ( ds . get_all_documents ( index = ds . index ) ) == 0
assert len ( ds . get_all_labels ( index = ds . label_index ) ) == 0
@pytest.mark.integration
def test_clone_embedding_field ( self , ds , documents ) :
cloned_field_name = " cloned "
ds . write_documents ( documents )
ds . clone_embedding_field ( cloned_field_name , " cosine " )
for doc in ds . get_all_documents ( ) :
meta = doc . to_dict ( ) [ " meta " ]
if " no_embedding " in meta :
# docs with no embedding should be ignored
assert cloned_field_name not in meta
else :
# docs with an original embedding should have the new one
assert cloned_field_name in meta
2022-08-25 17:33:26 +02:00
@pytest.mark.integration
2022-12-27 15:24:31 +01:00
@pytest.mark.parametrize ( " knn_engine " , [ " nmslib " , " faiss " , " score_script " ] )
def test_query_embedding_with_filters ( self , ds : OpenSearchDocumentStore , documents , knn_engine ) :
# Create another document store on top of the previous one
ds = OpenSearchDocumentStore (
index = ds . index , label_index = ds . label_index , recreate_index = True , knn_engine = knn_engine
)
2022-10-06 15:41:29 +02:00
ds . write_documents ( documents )
results = ds . query_by_embedding (
query_emb = np . random . rand ( 768 ) . astype ( np . float32 ) , filters = { " year " : " 2020 " } , top_k = 10
)
assert len ( results ) == 3
2022-12-08 08:28:43 +01:00
@pytest.mark.integration
@pytest.mark.parametrize ( " use_ann " , [ True , False ] )
def test_query_embedding_batch_with_filters ( self , ds : OpenSearchDocumentStore , documents , use_ann ) :
ds . embeddings_field_supports_similarity = use_ann
ds . write_documents ( documents )
results = ds . query_by_embedding_batch (
query_embs = [ np . random . rand ( 768 ) . astype ( np . float32 ) for _ in range ( 2 ) ] ,
filters = [ { " year " : " 2020 " } for _ in range ( 2 ) ] ,
top_k = 10 ,
)
assert len ( results ) == 2
for result in results :
assert len ( result ) == 3
2023-02-17 10:28:36 +01:00
@pytest.mark.integration
@pytest.mark.parametrize ( " index_type " , [ " ivf " , " ivf_pq " ] )
def test_train_index_from_documents ( self , ds : OpenSearchDocumentStore , documents , index_type ) :
# Create another document store on top of the previous one
ds = OpenSearchDocumentStore (
index = ds . index ,
label_index = ds . label_index ,
recreate_index = True ,
knn_engine = " faiss " ,
index_type = index_type ,
knn_parameters = { " code_size " : 2 } ,
)
# Check that IVF indices use score_script before training
emb_field_settings = ds . client . indices . get ( ds . index ) [ ds . index ] [ " mappings " ] [ " properties " ] [ ds . embedding_field ]
assert emb_field_settings == { " type " : " knn_vector " , " dimension " : 768 }
ds . train_index ( documents )
# Check that embedding_field_settings have been updated
emb_field_settings = ds . client . indices . get ( ds . index ) [ ds . index ] [ " mappings " ] [ " properties " ] [ ds . embedding_field ]
assert emb_field_settings == { " type " : " knn_vector " , " model_id " : f " { ds . index } -ivf " }
# Check that model uses expected parameters
expected_model_settigns = { " index_type " : index_type , " nlist " : 4 , " nprobes " : 1 }
if index_type == " ivf_pq " :
expected_model_settigns [ " code_size " ] = 2
expected_model_settigns [ " m " ] = 1
model_endpoint = f " /_plugins/_knn/models/ { ds . index } -ivf "
response = ds . client . transport . perform_request ( " GET " , url = model_endpoint )
model_settings_list = [ setting . split ( " : " ) for setting in response [ " description " ] . split ( ) ]
model_settings = { k : ( int ( v ) if v . isnumeric ( ) else v ) for k , v in model_settings_list }
assert model_settings == expected_model_settigns
@pytest.mark.integration
@pytest.mark.parametrize ( " index_type " , [ " ivf " , " ivf_pq " ] )
def test_train_index_from_embeddings ( self , ds : OpenSearchDocumentStore , documents , index_type ) :
# Create another document store on top of the previous one
ds = OpenSearchDocumentStore (
index = ds . index ,
label_index = ds . label_index ,
recreate_index = True ,
knn_engine = " faiss " ,
index_type = index_type ,
knn_parameters = { " code_size " : 2 } ,
)
# Check that IVF indices use HNSW with default settings before training
emb_field_settings = ds . client . indices . get ( ds . index ) [ ds . index ] [ " mappings " ] [ " properties " ] [ ds . embedding_field ]
assert emb_field_settings == { " type " : " knn_vector " , " dimension " : 768 }
embeddings = np . array ( [ doc . embedding for doc in documents if doc . embedding is not None ] )
ds . train_index ( embeddings = embeddings )
# Check that embedding_field_settings have been updated
emb_field_settings = ds . client . indices . get ( ds . index ) [ ds . index ] [ " mappings " ] [ " properties " ] [ ds . embedding_field ]
assert emb_field_settings == { " type " : " knn_vector " , " model_id " : f " { ds . index } -ivf " }
# Check that model uses expected parameters
expected_model_settigns = { " index_type " : index_type , " nlist " : 4 , " nprobes " : 1 }
if index_type == " ivf_pq " :
expected_model_settigns [ " code_size " ] = 2
expected_model_settigns [ " m " ] = 1
model_endpoint = f " /_plugins/_knn/models/ { ds . index } -ivf "
response = ds . client . transport . perform_request ( " GET " , url = model_endpoint )
model_settings_list = [ setting . split ( " : " ) for setting in response [ " description " ] . split ( ) ]
model_settings = { k : ( int ( v ) if v . isnumeric ( ) else v ) for k , v in model_settings_list }
assert model_settings == expected_model_settigns
@pytest.mark.integration
@pytest.mark.parametrize ( " index_type " , [ " ivf " , " ivf_pq " ] )
def test_train_index_with_write_documents ( self , ds : OpenSearchDocumentStore , documents , index_type ) :
# Create another document store on top of the previous one
ds = OpenSearchDocumentStore (
index = ds . index ,
label_index = ds . label_index ,
recreate_index = True ,
knn_engine = " faiss " ,
index_type = index_type ,
knn_parameters = { " code_size " : 2 } ,
ivf_train_size = 6 ,
)
# Check that IVF indices use HNSW with default settings before training
emb_field_settings = ds . client . indices . get ( ds . index ) [ ds . index ] [ " mappings " ] [ " properties " ] [ ds . embedding_field ]
assert emb_field_settings == { " type " : " knn_vector " , " dimension " : 768 }
ds . write_documents ( documents )
# Check that embedding_field_settings have been updated
emb_field_settings = ds . client . indices . get ( ds . index ) [ ds . index ] [ " mappings " ] [ " properties " ] [ ds . embedding_field ]
assert emb_field_settings == { " type " : " knn_vector " , " model_id " : f " { ds . index } -ivf " }
# Check that model uses expected parameters
expected_model_settigns = { " index_type " : index_type , " nlist " : 4 , " nprobes " : 1 }
if index_type == " ivf_pq " :
expected_model_settigns [ " code_size " ] = 2
expected_model_settigns [ " m " ] = 1
model_endpoint = f " /_plugins/_knn/models/ { ds . index } -ivf "
response = ds . client . transport . perform_request ( " GET " , url = model_endpoint )
model_settings_list = [ setting . split ( " : " ) for setting in response [ " description " ] . split ( ) ]
model_settings = { k : ( int ( v ) if v . isnumeric ( ) else v ) for k , v in model_settings_list }
assert model_settings == expected_model_settigns
2022-07-28 10:04:49 +02:00
# Unit tests
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test___init___api_key_raises_warning ( self , mocked_document_store , caplog ) :
with caplog . at_level ( logging . WARN , logger = " haystack.document_stores.opensearch " ) :
mocked_document_store . __init__ ( api_key = " foo " )
mocked_document_store . __init__ ( api_key_id = " bar " )
mocked_document_store . __init__ ( api_key = " foo " , api_key_id = " bar " )
assert len ( caplog . records ) == 3
for r in caplog . records :
assert r . levelname == " WARNING "
2023-01-24 10:01:39 +01:00
@pytest.mark.unit
def test__init_client_aws4auth_and_username_raises_warning ( self , mocked_open_search_init , caplog ) :
_init_client_remaining_kwargs = {
" host " : " host " ,
" port " : 443 ,
" password " : " pass " ,
" scheme " : " https " ,
" ca_certs " : None ,
" verify_certs " : True ,
" timeout " : 10 ,
" use_system_proxy " : False ,
}
with caplog . at_level ( logging . WARN , logger = " haystack.document_stores.opensearch " ) :
OpenSearchDocumentStore . _init_client ( username = " admin " , aws4auth = " foo " , * * _init_client_remaining_kwargs )
OpenSearchDocumentStore . _init_client ( username = " bar " , aws4auth = " foo " , * * _init_client_remaining_kwargs )
assert len ( caplog . records ) == 2
for r in caplog . records :
assert r . levelname == " WARNING "
caplog . clear ( )
with caplog . at_level ( logging . WARN , logger = " haystack.document_stores.opensearch " ) :
OpenSearchDocumentStore . _init_client ( username = None , aws4auth = " foo " , * * _init_client_remaining_kwargs )
OpenSearchDocumentStore . _init_client ( username = " foo " , aws4auth = None , * * _init_client_remaining_kwargs )
assert len ( caplog . records ) == 0
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test___init___connection_test_fails ( self , mocked_document_store ) :
failing_client = MagicMock ( )
failing_client . indices . get . side_effect = Exception ( " The client failed! " )
mocked_document_store . _init_client . return_value = failing_client
with pytest . raises ( ConnectionError ) :
mocked_document_store . __init__ ( )
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test___init___client_params ( self , mocked_open_search_init , _init_client_params ) :
"""
Ensure the Opensearch - py client was initialized with the right params
"""
OpenSearchDocumentStore . _init_client ( * * _init_client_params )
assert mocked_open_search_init . called
_ , kwargs = mocked_open_search_init . call_args
assert kwargs == {
" hosts " : [ { " host " : " localhost " , " port " : 9999 } ] ,
" http_auth " : ( " user " , " pass " ) ,
" scheme " : " http " ,
" ca_certs " : " ca_certs " ,
" verify_certs " : True ,
" timeout " : 42 ,
" connection_class " : RequestsHttpConnection ,
}
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__init_client_use_system_proxy_use_sys_proxy ( self , mocked_open_search_init , _init_client_params ) :
_init_client_params [ " use_system_proxy " ] = False
OpenSearchDocumentStore . _init_client ( * * _init_client_params )
_ , kwargs = mocked_open_search_init . call_args
assert kwargs [ " connection_class " ] == Urllib3HttpConnection
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__init_client_use_system_proxy_dont_use_sys_proxy ( self , mocked_open_search_init , _init_client_params ) :
_init_client_params [ " use_system_proxy " ] = True
OpenSearchDocumentStore . _init_client ( * * _init_client_params )
_ , kwargs = mocked_open_search_init . call_args
assert kwargs [ " connection_class " ] == RequestsHttpConnection
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__init_client_auth_methods_username_password ( self , mocked_open_search_init , _init_client_params ) :
_init_client_params [ " username " ] = " user "
_init_client_params [ " aws4auth " ] = None
OpenSearchDocumentStore . _init_client ( * * _init_client_params )
_ , kwargs = mocked_open_search_init . call_args
assert kwargs [ " http_auth " ] == ( " user " , " pass " )
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__init_client_auth_methods_aws_iam ( self , mocked_open_search_init , _init_client_params ) :
_init_client_params [ " username " ] = " "
_init_client_params [ " aws4auth " ] = " foo "
OpenSearchDocumentStore . _init_client ( * * _init_client_params )
_ , kwargs = mocked_open_search_init . call_args
assert kwargs [ " http_auth " ] == " foo "
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__init_client_auth_methods_no_auth ( self , mocked_open_search_init , _init_client_params ) :
_init_client_params [ " username " ] = " "
_init_client_params [ " aws4auth " ] = None
OpenSearchDocumentStore . _init_client ( * * _init_client_params )
_ , kwargs = mocked_open_search_init . call_args
assert " http_auth " not in kwargs
2023-04-26 21:58:33 +02:00
@pytest.mark.unit
def test_query ( self , mocked_document_store ) :
mocked_document_store . query ( query = self . query )
kwargs = mocked_document_store . client . search . call_args . kwargs
assert " index " in kwargs
assert " body " in kwargs
assert " headers " in kwargs
@pytest.mark.unit
def test_query_return_embedding_false ( self , mocked_document_store ) :
mocked_document_store . return_embedding = False
mocked_document_store . query ( self . query )
# assert the resulting body is consistent with the `excluded_meta_data` value
_ , kwargs = mocked_document_store . client . search . call_args
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " embedding " ] }
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_true ( self , mocked_document_store ) :
mocked_document_store . return_embedding = True
mocked_document_store . excluded_meta_data = [ " foo " , " embedding " ]
mocked_document_store . query ( self . query )
_ , kwargs = mocked_document_store . client . search . call_args
# we expect "embedding" was removed from the final query
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " foo " ] }
@pytest.mark.unit
def test_query_excluded_meta_data_return_embedding_false ( self , mocked_document_store ) :
mocked_document_store . return_embedding = False
mocked_document_store . excluded_meta_data = [ " foo " ]
mocked_document_store . query ( self . query )
# assert the resulting body is consistent with the `excluded_meta_data` value
_ , kwargs = mocked_document_store . client . search . call_args
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " foo " , " embedding " ] }
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test_query_by_embedding_raises_if_missing_field ( self , mocked_document_store ) :
mocked_document_store . embedding_field = " "
with pytest . raises ( DocumentStoreError ) :
mocked_document_store . query_by_embedding ( self . query_emb )
2023-02-17 10:28:36 +01:00
@pytest.mark.unit
def test_query_by_embedding_raises_if_ivf_untrained ( self , mocked_document_store ) :
mocked_document_store . index_type = " ivf "
mocked_document_store . ivf_train_size = 10
with pytest . raises ( DocumentStoreError , match = " Index of type ' ivf ' is not trained yet. " ) :
mocked_document_store . query_by_embedding ( self . query_emb )
@pytest.mark.unit
def test_query_by_embedding_batch_if_ivf_untrained ( self , mocked_document_store ) :
mocked_document_store . index_type = " ivf "
mocked_document_store . ivf_train_size = 10
with pytest . raises ( DocumentStoreError , match = " Index of type ' ivf ' is not trained yet. " ) :
mocked_document_store . query_by_embedding_batch ( [ self . query_emb ] )
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test_query_by_embedding_filters ( self , mocked_document_store ) :
2022-12-27 15:24:31 +01:00
assert mocked_document_store . knn_engine != " score_script "
2022-07-28 10:04:49 +02:00
expected_filters = { " type " : " article " , " date " : { " $gte " : " 2015-01-01 " , " $lt " : " 2021-01-01 " } }
mocked_document_store . query_by_embedding ( self . query_emb , filters = expected_filters )
# Assert the `search` method on the client was called with the filters we provided
_ , kwargs = mocked_document_store . client . search . call_args
actual_filters = kwargs [ " body " ] [ " query " ] [ " bool " ] [ " filter " ]
assert actual_filters [ " bool " ] [ " must " ] == [
{ " term " : { " type " : " article " } } ,
{ " range " : { " date " : { " gte " : " 2015-01-01 " , " lt " : " 2021-01-01 " } } } ,
]
2022-10-06 15:41:29 +02:00
@pytest.mark.unit
def test_query_by_embedding_script_score_filters ( self , mocked_document_store ) :
2022-12-27 15:24:31 +01:00
mocked_document_store . knn_engine = " score_script "
2022-10-06 15:41:29 +02:00
expected_filters = { " type " : " article " , " date " : { " $gte " : " 2015-01-01 " , " $lt " : " 2021-01-01 " } }
mocked_document_store . query_by_embedding ( self . query_emb , filters = expected_filters )
# Assert the `search` method on the client was called with the filters we provided
_ , kwargs = mocked_document_store . client . search . call_args
actual_filters = kwargs [ " body " ] [ " query " ] [ " script_score " ] [ " query " ] [ " bool " ] [ " filter " ]
assert actual_filters [ " bool " ] [ " must " ] == [
{ " term " : { " type " : " article " } } ,
{ " range " : { " date " : { " gte " : " 2015-01-01 " , " lt " : " 2021-01-01 " } } } ,
]
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test_query_by_embedding_return_embedding_false ( self , mocked_document_store ) :
mocked_document_store . return_embedding = False
mocked_document_store . query_by_embedding ( self . query_emb )
# assert the resulting body is consistent with the `excluded_meta_data` value
_ , kwargs = mocked_document_store . client . search . call_args
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " embedding " ] }
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test_query_by_embedding_excluded_meta_data_return_embedding_true ( self , mocked_document_store ) :
"""
Test that when ` return_embedding == True ` the field should NOT be excluded even if it
was added to ` excluded_meta_data `
"""
mocked_document_store . return_embedding = True
mocked_document_store . excluded_meta_data = [ " foo " , " embedding " ]
mocked_document_store . query_by_embedding ( self . query_emb )
_ , kwargs = mocked_document_store . client . search . call_args
# we expect "embedding" was removed from the final query
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " foo " ] }
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test_query_by_embedding_excluded_meta_data_return_embedding_false ( self , mocked_document_store ) :
"""
Test that when ` return_embedding == False ` , the final query excludes the ` embedding ` field
even if it wasn ' t explicitly added to `excluded_meta_data`
"""
mocked_document_store . return_embedding = False
mocked_document_store . excluded_meta_data = [ " foo " ]
mocked_document_store . query_by_embedding ( self . query_emb )
# assert the resulting body is consistent with the `excluded_meta_data` value
_ , kwargs = mocked_document_store . client . search . call_args
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " foo " , " embedding " ] }
2022-12-08 08:28:43 +01:00
@pytest.mark.unit
def test_query_by_embedding_batch_uses_msearch ( self , mocked_document_store ) :
mocked_document_store . query_by_embedding_batch ( [ self . query_emb for _ in range ( 10 ) ] )
# assert the resulting body is consistent with the `excluded_meta_data` value
_ , kwargs = mocked_document_store . client . msearch . call_args
assert len ( kwargs [ " body " ] ) == 20 # each search has headers and request
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__init_indices_with_alias ( self , mocked_document_store , caplog ) :
2022-07-28 10:04:49 +02:00
mocked_document_store . client . indices . exists_alias . return_value = True
2022-12-27 15:24:31 +01:00
with caplog . at_level ( logging . DEBUG , logger = " haystack.document_stores.search_engine " ) :
mocked_document_store . _init_indices ( self . index_name , " labels " , False , False )
2022-07-28 10:04:49 +02:00
assert f " Index name { self . index_name } is an alias. " in caplog . text
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_wrong_mapping_raises ( self , mocked_document_store , existing_index ) :
2022-07-28 10:04:49 +02:00
"""
Ensure the method raises if we specify a field in ` search_fields ` that ' s not text
"""
2022-12-27 15:24:31 +01:00
existing_index [ " mappings " ] [ " properties " ] [ " age " ] = { " type " : " integer " }
2022-07-28 10:04:49 +02:00
mocked_document_store . search_fields = [ " age " ]
2022-12-27 15:24:31 +01:00
with pytest . raises (
DocumentStoreError ,
match = f " The index ' { self . index_name } ' needs the ' text ' type for the search_field ' age ' to run full text search, but got type ' integer ' . " ,
) :
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_create_embedding_mapping_if_missing ( self , mocked_document_store ) :
2022-07-28 10:04:49 +02:00
mocked_document_store . embedding_field = " doesnt_have_a_mapping "
2022-12-27 15:24:31 +01:00
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
# Assert the expected body was passed to the client
_ , kwargs = mocked_document_store . client . indices . put_mapping . call_args
assert kwargs [ " index " ] == self . index_name
2022-12-27 15:24:31 +01:00
assert kwargs [ " body " ] [ " properties " ] [ " doesnt_have_a_mapping " ] [ " type " ] == " knn_vector "
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_create_search_field_mapping_if_missing ( self , mocked_document_store ) :
mocked_document_store . search_fields = [ " doesnt_have_a_mapping " ]
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
# Assert the expected body was passed to the client
_ , kwargs = mocked_document_store . client . indices . put_mapping . call_args
assert kwargs [ " index " ] == self . index_name
assert kwargs [ " body " ] [ " properties " ] [ " doesnt_have_a_mapping " ] [ " type " ] == " text "
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
@pytest.mark.unit
def test__validate_and_adjust_document_index_with_bad_field_raises ( self , mocked_document_store , existing_index ) :
existing_index [ " mappings " ] [ " properties " ] [ " age " ] = { " type " : " integer " }
mocked_document_store . embedding_field = " age "
2022-07-28 10:04:49 +02:00
with pytest . raises (
2022-12-27 15:24:31 +01:00
DocumentStoreError ,
match = f " The index ' { self . index_name } ' needs the ' knn_vector ' type for the embedding_field ' age ' to run vector search, but got type ' integer ' . " ,
2022-07-28 10:04:49 +02:00
) :
2022-12-27 15:24:31 +01:00
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_but_no_method ( self , mocked_document_store , existing_index ) :
2022-07-28 10:04:49 +02:00
"""
We call the method passing a properly mapped field but without the ` method ` specified in the mapping
"""
2022-12-27 15:24:31 +01:00
del existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ]
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
assert mocked_document_store . space_type == " innerproduct "
with pytest . raises (
DocumentStoreError ,
2023-09-14 16:42:48 +02:00
match = rf " Set `similarity` to one of ' \ [ ' l2 ' \ ] ' to properly use the embedding field ' embedding ' of index ' { self . index_name } ' . Similarity ' dot_product ' is not compatible with embedding field ' s space type ' l2 ' , it requires ' innerproduct ' . " ,
2022-12-27 15:24:31 +01:00
) :
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
# l2 is default for space_type so it must pass
mocked_document_store . space_type = " l2 "
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_similarity ( self , mocked_document_store ) :
mocked_document_store . space_type = " innerproduct "
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
@pytest.mark.unit
def test__validate_and_adjust_document_index_similarity_mismatch ( self , mocked_document_store ) :
mocked_document_store . space_type = " cosinesimil "
with pytest . raises (
DocumentStoreError ,
2023-09-14 16:42:48 +02:00
match = rf " Set `similarity` to one of ' \ [ ' dot_product ' \ ] ' to properly use the embedding field ' embedding ' of index ' { self . index_name } ' . Similarity ' dot_product ' is not compatible with embedding field ' s space type ' innerproduct ' , it requires ' cosinesimil ' . " ,
2022-12-27 15:24:31 +01:00
) :
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_type_mismatch ( self , mocked_document_store ) :
mocked_document_store . index_type = " hnsw "
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
with pytest . raises (
DocumentStoreError ,
match = f " The index_type ' hnsw ' needs ' 80 ' as ef_construction value. Currently, the value for embedding field ' embedding ' of index ' { self . index_name } ' is ' 512 ' . " ,
) :
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
@pytest.mark.unit
def test__validate_and_adjust_document_index_change_knn_engine_to_faiss ( self , mocked_document_store ) :
mocked_document_store . knn_engine = " faiss "
with pytest . raises (
DocumentStoreError ,
match = f " Existing embedding field ' { mocked_document_store . embedding_field } ' of OpenSearch index ' { self . index_name } ' has knn_engine ' nmslib ' , but knn_engine was set to ' faiss ' . " ,
) :
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_change_knn_engine_to_score_script ( self , mocked_document_store ) :
mocked_document_store . knn_engine = " score_script "
mocked_document_store . space_type = " cosinesimil "
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
@pytest.mark.unit
def test__validate_and_adjust_document_index_adjusts_ef_search_for_hnsw_when_default (
self , mocked_document_store , existing_index
2022-07-28 10:04:49 +02:00
) :
"""
2022-12-27 15:24:31 +01:00
Test adjustment when ` knn . algo_param ` is missing from the index settings
2022-07-28 10:04:49 +02:00
"""
2022-12-27 15:24:31 +01:00
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " ef_construction " ] = 80
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " m " ] = 64
2022-07-28 10:04:49 +02:00
mocked_document_store . index_type = " hnsw "
2022-12-27 15:24:31 +01:00
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
# assert the resulting body contains the adjusted params
2022-07-28 10:04:49 +02:00
_ , kwargs = mocked_document_store . client . indices . put_settings . call_args
assert kwargs [ " body " ] == { " knn.algo_param.ef_search " : 20 }
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_adjusts_ef_search_for_hnsw_when_set_different (
self , mocked_document_store , existing_index
) :
2022-07-28 10:04:49 +02:00
"""
Test a value of ` knn . algo_param ` that needs to be adjusted
"""
2022-12-27 15:24:31 +01:00
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " ef_construction " ] = 80
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " m " ] = 64
existing_index [ " settings " ] [ " index " ] [ " knn.algo_param " ] = { " ef_search " : 999 }
2022-07-28 10:04:49 +02:00
mocked_document_store . index_type = " hnsw "
2022-12-27 15:24:31 +01:00
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
# assert the resulting body is contains the adjusted params
_ , kwargs = mocked_document_store . client . indices . put_settings . call_args
assert kwargs [ " body " ] == { " knn.algo_param.ef_search " : 20 }
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_ignores_index_setting_ef_search_for_faiss (
self , mocked_document_store , existing_index
) :
mocked_document_store . knn_engine = " faiss "
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " engine " ] = " faiss "
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " ef_construction " ] = 512
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " m " ] = 16
existing_index [ " settings " ] [ " index " ] [ " knn.algo_param " ] = { " ef_search " : 999 }
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
mocked_document_store . client . indices . put_settings . assert_not_called ( )
@pytest.mark.unit
def test__validate_and_adjust_document_index_ignores_parameter_ef_search_for_nmslib (
self , mocked_document_store , existing_index
) :
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " ef_construction " ] = 512
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " m " ] = 16
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " ef_search " ] = 999
existing_index [ " settings " ] [ " index " ] [ " knn.algo_param " ] = { " ef_search " : 512 }
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
mocked_document_store . client . indices . put_settings . assert_not_called ( )
@pytest.mark.unit
def test__validate_and_adjust_document_index_does_not_adjust_ef_search_for_hnsw_when_set_correct (
self , mocked_document_store , existing_index
2022-07-28 10:04:49 +02:00
) :
"""
2022-12-27 15:24:31 +01:00
If params are already set correctly , we should not adjust them .
2022-07-28 10:04:49 +02:00
"""
2022-12-27 15:24:31 +01:00
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " ef_construction " ] = 80
existing_index [ " mappings " ] [ " properties " ] [ " embedding " ] [ " method " ] [ " parameters " ] [ " m " ] = 64
existing_index [ " settings " ] [ " index " ] [ " knn.algo_param " ] = { " ef_search " : 20 }
mocked_document_store . index_type = " hnsw "
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
2022-12-27 15:24:31 +01:00
mocked_document_store . client . indices . put_settings . assert_not_called ( )
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_adjusts_ef_search_for_flat_when_set_different (
self , mocked_document_store , existing_index
) :
2022-07-28 10:04:49 +02:00
"""
Test a value of ` knn . algo_param ` that needs to be adjusted
"""
2022-12-27 15:24:31 +01:00
existing_index [ " settings " ] [ " index " ] [ " knn.algo_param " ] = { " ef_search " : 999 }
2022-07-28 10:04:49 +02:00
mocked_document_store . index_type = " flat "
2022-12-27 15:24:31 +01:00
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
2022-07-28 10:04:49 +02:00
# assert the resulting body is contains the adjusted params
_ , kwargs = mocked_document_store . client . indices . put_settings . call_args
assert kwargs [ " body " ] == { " knn.algo_param.ef_search " : 512 }
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test__validate_and_adjust_document_index_does_not_adjust_ef_search_for_flat_when_default (
self , mocked_document_store
) :
"""
If ` knn . algo_param ` is missing , default value needs no adjustments
"""
mocked_document_store . index_type = " flat "
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
mocked_document_store . client . indices . put_settings . assert_not_called ( )
@pytest.mark.unit
def test__validate_and_adjust_document_index_does_not_adjust_ef_search_for_flat_when_set_correct (
self , mocked_document_store , existing_index
) :
"""
If ` knn . algo_param ` is correct , value needs no adjustments
"""
existing_index [ " settings " ] [ " index " ] [ " knn.algo_param " ] = { " ef_search " : 512 }
mocked_document_store . index_type = " flat "
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
mocked_document_store . client . indices . put_settings . assert_not_called ( )
@pytest.mark.unit
def test__validate_and_adjust_document_index_with_non_existing_index ( self , mocked_document_store , caplog ) :
mocked_document_store . client . indices . get . return_value = { }
with caplog . at_level ( logging . WARNING ) :
mocked_document_store . _validate_and_adjust_document_index ( self . index_name )
assert f " The index ' { self . index_name } ' doesn ' t exist. " in caplog . text
@pytest.mark.unit
@pytest.mark.parametrize ( " create_index " , [ True , False ] )
@pytest.mark.parametrize ( " recreate_index " , [ True , False ] )
def test__init_indices_always_calls_validation_if_no_custom_mapping (
self , mocked_document_store , create_index , recreate_index
) :
mocked_document_store . _validate_and_adjust_document_index = MagicMock ( )
mocked_document_store . _init_indices ( self . index_name , " label_index " , create_index , recreate_index )
mocked_document_store . _validate_and_adjust_document_index . assert_called_once ( )
@pytest.mark.unit
@pytest.mark.parametrize ( " create_index " , [ True , False ] )
@pytest.mark.parametrize ( " recreate_index " , [ True , False ] )
def test__init_indices_never_calls_validation_if_custom_mapping (
self , mocked_document_store , create_index , recreate_index , caplog
) :
mocked_document_store . custom_mapping = {
" mappings " : { " properties " : { " embedding " : { " type " : " dense_vector " , " dims " : 768 } } }
}
mocked_document_store . _validate_and_adjust_document_index = MagicMock ( )
with caplog . at_level ( logging . WARNING ) :
mocked_document_store . _init_indices ( self . index_name , " label_index " , create_index , recreate_index )
assert " Skipping index validation " in caplog . text
mocked_document_store . _validate_and_adjust_document_index . assert_not_called ( )
@pytest.mark.unit
def test__init_indices_creates_index_if_not_exists ( self , mocked_document_store ) :
mocked_document_store . client . indices . exists . return_value = False
mocked_document_store . _init_indices ( self . index_name , " label_index " , create_index = True , recreate_index = False )
mocked_document_store . client . indices . create . assert_called ( )
@pytest.mark.unit
def test__init_indices_does_not_create_index_if_exists ( self , mocked_document_store ) :
mocked_document_store . _init_indices ( self . index_name , " label_index " , create_index = True , recreate_index = False )
mocked_document_store . client . indices . create . assert_not_called ( )
@pytest.mark.unit
def test__init_indices_does_not_create_index_if_not_create_index ( self , mocked_document_store ) :
2022-07-28 10:04:49 +02:00
mocked_document_store . client . indices . exists . return_value = False
2022-12-27 15:24:31 +01:00
mocked_document_store . _init_indices ( self . index_name , " label_index " , create_index = False , recreate_index = False )
mocked_document_store . client . indices . create . assert_not_called ( )
@pytest.mark.unit
def test__init_indices_creates_index_if_exists_and_recreate_index ( self , mocked_document_store ) :
2023-02-17 10:28:36 +01:00
# delete_index asks four times: one check for doc index, one check for label index
# + one check for both if ivf model exists
# create_index asks two times: one for doc index, one for label index
mocked_document_store . client . indices . exists . side_effect = [ True , False , True , False , False , False ]
2022-12-27 15:24:31 +01:00
mocked_document_store . _init_indices ( self . index_name , " label_index " , create_index = True , recreate_index = True )
mocked_document_store . client . indices . delete . assert_called ( )
mocked_document_store . client . indices . create . assert_called ( )
@pytest.mark.unit
def test__create_document_index_no_index_custom_mapping ( self , mocked_document_store ) :
2022-07-28 10:04:49 +02:00
mocked_document_store . custom_mapping = { " mappings " : { " properties " : { " a_number " : { " type " : " integer " } } } }
mocked_document_store . _create_document_index ( self . index_name )
_ , kwargs = mocked_document_store . client . indices . create . call_args
assert kwargs [ " body " ] == { " mappings " : { " properties " : { " a_number " : { " type " : " integer " } } } }
2022-12-27 15:24:31 +01:00
assert mocked_document_store . knn_engine == " nmslib "
assert mocked_document_store . space_type == " innerproduct "
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__create_document_index_no_index_no_mapping ( self , mocked_document_store ) :
mocked_document_store . _create_document_index ( self . index_name )
_ , kwargs = mocked_document_store . client . indices . create . call_args
assert kwargs [ " body " ] == {
" mappings " : {
" dynamic_templates " : [
{ " strings " : { " mapping " : { " type " : " keyword " } , " match_mapping_type " : " string " , " path_match " : " * " } }
] ,
" properties " : {
" content " : { " type " : " text " } ,
" embedding " : {
" dimension " : 768 ,
" method " : {
" engine " : " nmslib " ,
" name " : " hnsw " ,
" parameters " : { " ef_construction " : 512 , " m " : 16 } ,
" space_type " : " innerproduct " ,
} ,
" type " : " knn_vector " ,
} ,
" name " : { " type " : " keyword " } ,
} ,
} ,
" settings " : { " analysis " : { " analyzer " : { " default " : { " type " : " standard " } } } , " index " : { " knn " : True } } ,
}
2022-12-27 15:24:31 +01:00
assert mocked_document_store . knn_engine == " nmslib "
assert mocked_document_store . space_type == " innerproduct "
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__create_document_index_no_index_no_mapping_with_synonyms ( self , mocked_document_store ) :
mocked_document_store . search_fields = [ " occupation " ]
mocked_document_store . synonyms = [ " foo " ]
mocked_document_store . _create_document_index ( self . index_name )
_ , kwargs = mocked_document_store . client . indices . create . call_args
assert kwargs [ " body " ] == {
" mappings " : {
" properties " : {
" name " : { " type " : " keyword " } ,
" content " : { " type " : " text " , " analyzer " : " synonym " } ,
" occupation " : { " type " : " text " , " analyzer " : " synonym " } ,
" embedding " : {
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " innerproduct " ,
" name " : " hnsw " ,
" engine " : " nmslib " ,
" parameters " : { " ef_construction " : 512 , " m " : 16 } ,
} ,
} ,
} ,
" dynamic_templates " : [
{ " strings " : { " path_match " : " * " , " match_mapping_type " : " string " , " mapping " : { " type " : " keyword " } } }
] ,
} ,
" settings " : {
" analysis " : {
" analyzer " : {
" default " : { " type " : " standard " } ,
" synonym " : { " tokenizer " : " whitespace " , " filter " : [ " lowercase " , " synonym " ] } ,
} ,
" filter " : { " synonym " : { " type " : " synonym " , " synonyms " : [ " foo " ] } } ,
} ,
" index " : { " knn " : True } ,
} ,
}
2022-12-27 15:24:31 +01:00
assert mocked_document_store . knn_engine == " nmslib "
assert mocked_document_store . space_type == " innerproduct "
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__create_document_index_no_index_no_mapping_with_embedding_field ( self , mocked_document_store ) :
mocked_document_store . embedding_field = " vec "
mocked_document_store . index_type = " hnsw "
mocked_document_store . _create_document_index ( self . index_name )
_ , kwargs = mocked_document_store . client . indices . create . call_args
assert kwargs [ " body " ] == {
" mappings " : {
" properties " : {
" name " : { " type " : " keyword " } ,
" content " : { " type " : " text " } ,
" vec " : {
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " innerproduct " ,
" name " : " hnsw " ,
" engine " : " nmslib " ,
" parameters " : { " ef_construction " : 80 , " m " : 64 } ,
} ,
} ,
} ,
" dynamic_templates " : [
{ " strings " : { " path_match " : " * " , " match_mapping_type " : " string " , " mapping " : { " type " : " keyword " } } }
] ,
} ,
" settings " : {
" analysis " : { " analyzer " : { " default " : { " type " : " standard " } } } ,
" index " : { " knn " : True , " knn.algo_param.ef_search " : 20 } ,
} ,
}
2022-12-27 15:24:31 +01:00
assert mocked_document_store . knn_engine == " nmslib "
assert mocked_document_store . space_type == " innerproduct "
2022-07-28 10:04:49 +02:00
2022-08-24 16:43:48 +02:00
@pytest.mark.unit
def test__create_document_index_no_index_no_mapping_faiss ( self , mocked_document_store ) :
mocked_document_store . knn_engine = " faiss "
mocked_document_store . _create_document_index ( self . index_name )
_ , kwargs = mocked_document_store . client . indices . create . call_args
assert kwargs [ " body " ] == {
" mappings " : {
" dynamic_templates " : [
{ " strings " : { " mapping " : { " type " : " keyword " } , " match_mapping_type " : " string " , " path_match " : " * " } }
] ,
" properties " : {
" content " : { " type " : " text " } ,
" embedding " : {
" dimension " : 768 ,
" method " : {
" engine " : " faiss " ,
" name " : " hnsw " ,
" parameters " : { " ef_construction " : 512 , " m " : 16 } ,
" space_type " : " innerproduct " ,
} ,
" type " : " knn_vector " ,
} ,
" name " : { " type " : " keyword " } ,
} ,
} ,
" settings " : { " analysis " : { " analyzer " : { " default " : { " type " : " standard " } } } , " index " : { " knn " : True } } ,
}
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__create_document_index_client_failure ( self , mocked_document_store ) :
mocked_document_store . client . indices . exists . return_value = False
mocked_document_store . client . indices . create . side_effect = RequestError
with pytest . raises ( RequestError ) :
mocked_document_store . _create_document_index ( self . index_name )
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__get_embedding_field_mapping_flat ( self , mocked_document_store ) :
mocked_document_store . index_type = " flat "
2022-12-27 15:24:31 +01:00
assert mocked_document_store . _get_embedding_field_mapping ( ) == {
2022-07-28 10:04:49 +02:00
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " innerproduct " ,
" name " : " hnsw " ,
" engine " : " nmslib " ,
" parameters " : { " ef_construction " : 512 , " m " : 16 } ,
} ,
}
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2023-02-17 10:28:36 +01:00
def test__get_embedding_field_mapping_default_hnsw ( self , mocked_document_store ) :
2022-07-28 10:04:49 +02:00
mocked_document_store . index_type = " hnsw "
2022-12-27 15:24:31 +01:00
assert mocked_document_store . _get_embedding_field_mapping ( ) == {
2022-07-28 10:04:49 +02:00
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " innerproduct " ,
" name " : " hnsw " ,
" engine " : " nmslib " ,
" parameters " : { " ef_construction " : 80 , " m " : 64 } ,
} ,
}
2022-08-24 16:43:48 +02:00
@pytest.mark.unit
2023-02-17 10:28:36 +01:00
def test__get_embedding_field_mapping_default_hnsw_faiss ( self , mocked_document_store ) :
2022-08-24 16:43:48 +02:00
mocked_document_store . index_type = " hnsw "
mocked_document_store . knn_engine = " faiss "
2022-12-27 15:24:31 +01:00
assert mocked_document_store . _get_embedding_field_mapping ( ) == {
2022-08-24 16:43:48 +02:00
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " innerproduct " ,
" name " : " hnsw " ,
" engine " : " faiss " ,
" parameters " : { " ef_construction " : 80 , " m " : 64 , " ef_search " : 20 } ,
} ,
}
2023-02-17 10:28:36 +01:00
@pytest.mark.unit
def test__get_embedding_field_mapping_custom_hnsw ( self , mocked_document_store ) :
mocked_document_store . index_type = " hnsw "
mocked_document_store . knn_parameters = { " ef_construction " : 1 , " m " : 2 }
assert mocked_document_store . _get_embedding_field_mapping ( ) == {
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " innerproduct " ,
" engine " : " nmslib " ,
" name " : " hnsw " ,
" parameters " : { " ef_construction " : 1 , " m " : 2 } ,
} ,
}
@pytest.mark.unit
def test__get_embedding_field_mapping_custom_hnsw_faiss ( self , mocked_document_store ) :
mocked_document_store . index_type = " hnsw "
mocked_document_store . knn_engine = " faiss "
mocked_document_store . knn_parameters = { " ef_construction " : 1 , " m " : 2 , " ef_search " : 3 }
assert mocked_document_store . _get_embedding_field_mapping ( ) == {
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " innerproduct " ,
" engine " : " faiss " ,
" name " : " hnsw " ,
" parameters " : { " ef_construction " : 1 , " m " : 2 , " ef_search " : 3 } ,
} ,
}
@pytest.mark.unit
def test__get_embedding_field_mapping_ivf ( self , mocked_document_store ) :
mocked_document_store . index_type = " ivf "
mocked_document_store . knn_engine = " faiss "
mocked_document_store . client . indices . exists . return_value = False
# Before training, IVF indices use HNSW with default settings
assert mocked_document_store . _get_embedding_field_mapping ( ) == { " type " : " knn_vector " , " dimension " : 768 }
# Assume we have trained the index
mocked_document_store . client . indices . exists . return_value = True
mocked_document_store . client . transport . perform_request . return_value = {
" took " : 4 ,
" timed_out " : False ,
" _shards " : { " total " : 1 , " successful " : 1 , " skipped " : 0 , " failed " : 0 } ,
" hits " : {
" total " : { " value " : 1 , " relation " : " eq " } ,
" max_score " : 1.0 ,
" hits " : [
{
" _index " : " .opensearch-knn-models " ,
" _type " : " _doc " ,
" _id " : " document-ivf " ,
" _score " : 1.0 ,
" _source " : {
" model_blob " : " <SOME MODEL BLOB> " ,
" engine " : " faiss " ,
" space_type " : " innerproduct " ,
" description " : " index_type:ivf nlist:4 nprobes:1 " ,
" model_id " : f " { mocked_document_store . index } -ivf " ,
" state " : " created " ,
" error " : " " ,
" dimension " : 768 ,
" timestamp " : " 2023-01-25T16:04:21.284398Z " ,
} ,
}
] ,
} ,
}
assert mocked_document_store . _get_embedding_field_mapping ( ) == {
" type " : " knn_vector " ,
" model_id " : f " { mocked_document_store . index } -ivf " ,
}
@pytest.mark.unit
def test__get_embedding_field_mapping_ivfpq ( self , mocked_document_store ) :
mocked_document_store . index_type = " ivf_pq "
mocked_document_store . knn_engine = " faiss "
mocked_document_store . client . indices . exists . return_value = False
# Before training, IVF indices use HNSW with default settings
assert mocked_document_store . _get_embedding_field_mapping ( ) == { " type " : " knn_vector " , " dimension " : 768 }
# Assume we have trained the index
mocked_document_store . client . indices . exists . return_value = True
mocked_document_store . client . transport . perform_request . return_value = {
" took " : 4 ,
" timed_out " : False ,
" _shards " : { " total " : 1 , " successful " : 1 , " skipped " : 0 , " failed " : 0 } ,
" hits " : {
" total " : { " value " : 1 , " relation " : " eq " } ,
" max_score " : 1.0 ,
" hits " : [
{
" _index " : " .opensearch-knn-models " ,
" _type " : " _doc " ,
" _id " : " document-ivf " ,
" _score " : 1.0 ,
" _source " : {
" model_blob " : " <SOME MODEL BLOB> " ,
" engine " : " faiss " ,
" space_type " : " innerproduct " ,
" description " : " index_type:ivf_pq nlist:4 nprobes:1 m:1 code_size:8 " ,
" model_id " : f " { mocked_document_store . index } -ivf " ,
" state " : " created " ,
" error " : " " ,
" dimension " : 768 ,
" timestamp " : " 2023-01-25T16:04:21.284398Z " ,
} ,
}
] ,
} ,
}
assert mocked_document_store . _get_embedding_field_mapping ( ) == {
" type " : " knn_vector " ,
" model_id " : f " { mocked_document_store . index } -ivf " ,
}
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__get_embedding_field_mapping_wrong ( self , mocked_document_store , caplog ) :
mocked_document_store . index_type = " foo "
with caplog . at_level ( logging . ERROR , logger = " haystack.document_stores.opensearch " ) :
2022-12-27 15:24:31 +01:00
retval = mocked_document_store . _get_embedding_field_mapping ( )
2022-07-28 10:04:49 +02:00
2023-02-17 10:28:36 +01:00
assert " Set index_type to either ' flat ' , ' hnsw ' , ' ivf ' , or ' ivf_pq ' " in caplog . text
2022-07-28 10:04:49 +02:00
assert retval == {
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : { " space_type " : " innerproduct " , " name " : " hnsw " , " engine " : " nmslib " } ,
}
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__create_label_index_already_exists ( self , mocked_document_store ) :
mocked_document_store . client . indices . exists . return_value = True
2022-12-27 15:24:31 +01:00
mocked_document_store . _init_indices ( " doc_index " , " label_index " , True , False )
2022-07-28 10:04:49 +02:00
mocked_document_store . client . indices . create . assert_not_called ( )
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__create_label_index_client_error ( self , mocked_document_store ) :
mocked_document_store . client . indices . exists . return_value = False
mocked_document_store . client . indices . create . side_effect = RequestError
with pytest . raises ( RequestError ) :
mocked_document_store . _create_label_index ( " foo " )
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__get_vector_similarity_query_support_true ( self , mocked_document_store ) :
mocked_document_store . embedding_field = " FooField "
2022-12-27 15:24:31 +01:00
assert mocked_document_store . knn_engine != " score_script "
2022-07-28 10:04:49 +02:00
assert mocked_document_store . _get_vector_similarity_query ( self . query_emb , 3 ) == {
" bool " : { " must " : [ { " knn " : { " FooField " : { " vector " : self . query_emb . tolist ( ) , " k " : 3 } } } ] }
}
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__get_vector_similarity_query_support_false ( self , mocked_document_store ) :
mocked_document_store . embedding_field = " FooField "
2022-12-27 15:24:31 +01:00
mocked_document_store . knn_engine = " score_script "
mocked_document_store . space_type = " innerproduct "
2022-07-28 10:04:49 +02:00
assert mocked_document_store . _get_vector_similarity_query ( self . query_emb , 3 ) == {
" script_score " : {
" query " : { " match_all " : { } } ,
" script " : {
" source " : " knn_score " ,
" lang " : " knn " ,
" params " : {
" field " : " FooField " ,
" query_value " : self . query_emb . tolist ( ) ,
" space_type " : " innerproduct " ,
} ,
} ,
}
}
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__get_raw_similarity_score_dot ( self , mocked_document_store ) :
mocked_document_store . similarity = " dot_product "
assert mocked_document_store . _get_raw_similarity_score ( 2 ) == 1
assert mocked_document_store . _get_raw_similarity_score ( - 2 ) == 1.5
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__get_raw_similarity_score_l2 ( self , mocked_document_store ) :
mocked_document_store . similarity = " l2 "
assert mocked_document_store . _get_raw_similarity_score ( 1 ) == 0
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-07-28 10:04:49 +02:00
def test__get_raw_similarity_score_cosine ( self , mocked_document_store ) :
2022-12-27 15:24:31 +01:00
mocked_document_store . space_type = " cosinesimil "
assert mocked_document_store . knn_engine != " score_script "
2022-07-28 10:04:49 +02:00
assert mocked_document_store . _get_raw_similarity_score ( 1 ) == 1
2022-12-27 15:24:31 +01:00
mocked_document_store . knn_engine = " score_script "
2022-07-28 10:04:49 +02:00
assert mocked_document_store . _get_raw_similarity_score ( 1 ) == 0
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test_clone_embedding_field_duplicate_mapping ( self , mocked_document_store ) :
2022-07-28 10:04:49 +02:00
mocked_document_store . index = self . index_name
2022-12-27 15:24:31 +01:00
with pytest . raises ( Exception , match = " embedding already exists with mapping " ) :
mocked_document_store . clone_embedding_field ( " embedding " , " cosine " )
2022-07-28 10:04:49 +02:00
2022-08-03 19:19:07 +02:00
@pytest.mark.unit
2022-12-27 15:24:31 +01:00
def test_clone_embedding_field_update_mapping ( self , mocked_document_store , monkeypatch ) :
2022-07-28 10:04:49 +02:00
mocked_document_store . index = self . index_name
# Mock away tqdm and the batch logic so we can test the mapping update alone
mocked_document_store . _get_all_documents_in_index = MagicMock ( return_value = [ ] )
monkeypatch . setattr ( tqdm , " __new__ " , MagicMock ( ) )
mocked_document_store . clone_embedding_field ( " a_field " , " cosine " )
_ , kwargs = mocked_document_store . client . indices . put_mapping . call_args
assert kwargs [ " body " ] [ " properties " ] [ " a_field " ] == {
" type " : " knn_vector " ,
" dimension " : 768 ,
" method " : {
" space_type " : " cosinesimil " ,
" name " : " hnsw " ,
" engine " : " nmslib " ,
" parameters " : { " ef_construction " : 512 , " m " : 16 } ,
} ,
}
2022-09-13 14:30:30 +01:00
@pytest.mark.unit
def test_bulk_write_retries_for_always_failing_insert_is_canceled ( self , mocked_document_store , monkeypatch , caplog ) :
docs_to_write = [
{ " meta " : { " name " : f " name_ { i } " } , " content " : f " text_ { i } " , " embedding " : np . random . rand ( 768 ) . astype ( np . float32 ) }
for i in range ( 1000 )
]
2022-10-13 11:53:27 +02:00
with patch ( " haystack.document_stores.opensearch.bulk " ) as mocked_bulk :
2022-09-13 14:30:30 +01:00
mocked_bulk . side_effect = opensearchpy . TransportError ( 429 , " Too many requests " )
with pytest . raises ( DocumentStoreError , match = " Last try of bulk indexing documents failed. " ) :
mocked_document_store . _bulk ( documents = docs_to_write , _timeout = 0 , _remaining_tries = 3 )
2023-09-13 16:14:45 +02:00
assert mocked_bulk . call_count == 3 # depth first search fails and cancels the whole bulk request
2022-09-13 14:30:30 +01:00
2023-09-13 16:14:45 +02:00
assert " Too Many Requests " in caplog . text
2022-09-13 14:30:30 +01:00
assert " Splitting the number of documents into two chunks with the same size " in caplog . text
@pytest.mark.unit
def test_bulk_write_retries_with_backoff_with_smaller_batch_size_on_too_many_requests (
self , mocked_document_store , monkeypatch
) :
docs_to_write = [
{ " meta " : { " name " : f " name_ { i } " } , " content " : f " text_ { i } " , " embedding " : np . random . rand ( 768 ) . astype ( np . float32 ) }
for i in range ( 1000 )
]
2022-10-13 11:53:27 +02:00
with patch ( " haystack.document_stores.opensearch.bulk " ) as mocked_bulk :
2022-09-13 14:30:30 +01:00
# make bulk insert split documents and request retries s.t.
# 1k => 500 (failed) + 500 (successful) => 250 (successful) + 250 (successful)
# resulting in 5 calls in total
mocked_bulk . side_effect = [
opensearchpy . TransportError ( 429 , " Too many requests " ) ,
opensearchpy . TransportError ( 429 , " Too many requests " ) ,
None ,
None ,
None ,
]
mocked_document_store . _bulk ( documents = docs_to_write , _timeout = 0 , _remaining_tries = 3 )
assert mocked_bulk . call_count == 5
2023-04-18 15:40:17 +02:00
@pytest.mark.unit
def test_get_document_by_id_return_embedding_false ( self , mocked_document_store ) :
mocked_document_store . return_embedding = False
mocked_document_store . get_document_by_id ( " 123 " )
# assert the resulting body is consistent with the `excluded_meta_data` value
_ , kwargs = mocked_document_store . client . search . call_args
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " embedding " ] }
@pytest.mark.unit
def test_get_document_by_id_excluded_meta_data_has_no_influence ( self , mocked_document_store ) :
mocked_document_store . excluded_meta_data = [ " foo " ]
mocked_document_store . return_embedding = False
mocked_document_store . get_document_by_id ( " 123 " )
# assert the resulting body is not affected by the `excluded_meta_data` value
_ , kwargs = mocked_document_store . client . search . call_args
assert kwargs [ " body " ] [ " _source " ] == { " excludes " : [ " embedding " ] }
2023-06-01 18:47:24 +02:00
@pytest.mark.unit
def test_write_documents_req_for_each_batch ( self , mocked_document_store , documents ) :
mocked_document_store . batch_size = 2
with patch ( " haystack.document_stores.opensearch.bulk " ) as mocked_bulk :
mocked_document_store . write_documents ( documents )
assert mocked_bulk . call_count == 5