2021-06-10 13:13:53 +05:30
import numpy as np
import pytest
2021-10-25 15:50:23 +02:00
from haystack . schema import Document
2021-06-10 13:13:53 +05:30
from conftest import get_document_store
import uuid
# Length of the random float32 vectors used as embeddings in this module's
# sample documents and query calls.
embedding_dim = 768
2021-11-04 09:27:12 +01:00
2021-06-10 13:13:53 +05:30
def get_uuid():
    """Return a newly generated random (version 4) UUID rendered as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
2021-11-04 09:27:12 +01:00
2021-06-10 13:13:53 +05:30
# Five sample documents in dict form.
# The first entry uses a hand-written id (its own text says it is not a correct
# uuid) and carries no embedding -- presumably to exercise handling of
# non-UUID ids; confirm against the document store implementation.
# The other four each get a generated UUID and a random float32 embedding of
# length `embedding_dim`.
DOCUMENTS = [
    {" content ": " text1 ", " id ": " not a correct uuid ", " key ": " a "},
    {
        " content ": " text2 ",
        " id ": get_uuid(),
        " key ": " b ",
        " embedding ": np.random.rand(embedding_dim).astype(np.float32),
    },
    {
        " content ": " text3 ",
        " id ": get_uuid(),
        " key ": " b ",
        " embedding ": np.random.rand(embedding_dim).astype(np.float32),
    },
    {
        " content ": " text4 ",
        " id ": get_uuid(),
        " key ": " b ",
        " embedding ": np.random.rand(embedding_dim).astype(np.float32),
    },
    {
        " content ": " text5 ",
        " id ": get_uuid(),
        " key ": " b ",
        " embedding ": np.random.rand(embedding_dim).astype(np.float32),
    },
]
# Three small sample documents, each supplied in a different input shape.
# Every entry gets a generated UUID and a random float32 embedding of length
# `embedding_dim`.
DOCUMENTS_XS = [
    # current "dict" format for a document (metadata nested under its own key)
    {
        " content ": " My name is Carla and I live in Berlin ",
        " id ": get_uuid(),
        " meta ": {" metafield ": " test1 ", " name ": " filename1 "},
        " embedding ": np.random.rand(embedding_dim).astype(np.float32),
    },
    # meta_field at the top level for backward compatibility
    {
        " content ": " My name is Paul and I live in New York ",
        " id ": get_uuid(),
        " metafield ": " test2 ",
        " name ": " filename2 ",
        " embedding ": np.random.rand(embedding_dim).astype(np.float32),
    },
    # Document object for a doc
    Document(
        content=" My name is Christelle and I live in Paris ",
        id=get_uuid(),
        meta={" metafield ": " test3 ", " name ": " filename3 "},
        embedding=np.random.rand(embedding_dim).astype(np.float32),
    ),
]
2021-11-04 09:27:12 +01:00
2021-06-10 13:13:53 +05:30
@pytest.fixture(params=[" weaviate "])
def document_store_with_docs(request, tmp_path):
    """Yield a document store that already contains ``DOCUMENTS_XS``.

    The store is obtained from ``get_document_store`` using the fixture
    parameter and pytest's ``tmp_path``. After the test finishes, all
    documents are deleted from the store.
    """
    document_store = get_document_store(request.param, tmp_path=tmp_path)
    try:
        document_store.write_documents(DOCUMENTS_XS)
        yield document_store
    finally:
        # Clean up even when populating the store fails part-way, so a broken
        # setup does not leave documents behind for later tests.
        document_store.delete_documents()
2021-06-10 13:13:53 +05:30
2021-11-04 09:27:12 +01:00
2021-06-10 13:13:53 +05:30
@pytest.fixture(params=[" weaviate "])
def document_store(request, tmp_path):
    """Yield a document store with nothing written to it by this fixture.

    The store is obtained from ``get_document_store`` using the fixture
    parameter and pytest's ``tmp_path``. After the test finishes, all
    documents are deleted from the store.
    """
    store = get_document_store(request.param, tmp_path=tmp_path)
    yield store
    # Teardown: remove whatever the test wrote.
    store.delete_documents()
2021-06-10 13:13:53 +05:30
@pytest.mark.weaviate
@pytest.mark.parametrize(" document_store ", [" weaviate "], indirect=True)
@pytest.mark.parametrize(" batch_size ", [2])
def test_weaviate_write_docs(document_store, batch_size):
    """Write ``DOCUMENTS`` in slices of ``batch_size`` and check all are returned.

    Retrieval is checked twice: once with the default arguments of
    ``get_all_documents`` and once with an explicit ``batch_size``.
    """
    expected_count = len(DOCUMENTS)

    # Write in small batches
    for start in range(0, expected_count, batch_size):
        chunk = DOCUMENTS[start:start + batch_size]
        document_store.write_documents(chunk)

    fetched_default = document_store.get_all_documents()
    assert len(fetched_default) == expected_count

    fetched_batched = document_store.get_all_documents(batch_size=batch_size)
    assert len(fetched_batched) == expected_count
@pytest.mark.weaviate
@pytest.mark.parametrize(" document_store_with_docs ", [" weaviate "], indirect=True)
def test_query_by_embedding(document_store_with_docs):
    """Check result counts of ``query_by_embedding`` on the pre-filled store.

    Three calls are made, each with its own random query vector: one with
    default arguments (3 results expected), one with ``top_k=1`` and one with
    a filter on the name field (1 result expected each).
    """

    def random_query_vector():
        # Each query draws a fresh random float32 vector.
        return np.random.rand(embedding_dim).astype(np.float32)

    store = document_store_with_docs

    unrestricted_hits = store.query_by_embedding(random_query_vector())
    assert len(unrestricted_hits) == 3

    single_hit = store.query_by_embedding(random_query_vector(), top_k=1)
    assert len(single_hit) == 1

    name_filter = {" name ": [' filename2 ']}
    filtered_hits = store.query_by_embedding(random_query_vector(), filters=name_filter)
    assert len(filtered_hits) == 1
@pytest.mark.weaviate
@pytest.mark.parametrize(" document_store_with_docs ", [" weaviate "], indirect=True)
def test_query(document_store_with_docs):
    """Check ``query`` on the pre-filled store.

    Passing a query text is expected to raise, while filter-only queries are
    expected to return matching documents: one for a name filter, one for the
    lower-cased full content of a document, and three for the word shared by
    all fixture documents.
    """
    query_text = ' My name is Carla and I live in Berlin '

    # A text query must raise; the result is irrelevant, so nothing is bound.
    # NOTE(review): the concrete exception type is not visible here, hence the
    # broad ``Exception`` is kept -- narrow it once the type is confirmed.
    with pytest.raises(Exception):
        document_store_with_docs.query(query_text)

    docs = document_store_with_docs.query(filters={" name ": [' filename2 ']})
    assert len(docs) == 1

    docs = document_store_with_docs.query(filters={" content ": [query_text.lower()]})
    assert len(docs) == 1

    docs = document_store_with_docs.query(filters={" content ": [' live ']})
    assert len(docs) == 3