2023-02-16 09:43:25 +01:00
|
|
|
import os
|
|
|
|
import uuid
|
|
|
|
from contextlib import contextmanager
|
2023-03-01 17:30:55 +01:00
|
|
|
import random
|
2023-03-01 14:52:05 +01:00
|
|
|
from pathlib import Path
|
2023-02-16 09:43:25 +01:00
|
|
|
|
2023-03-01 17:30:55 +01:00
|
|
|
import torch
|
|
|
|
import numpy as np
|
|
|
|
|
2023-02-16 09:43:25 +01:00
|
|
|
import pytest
|
|
|
|
|
|
|
|
from haystack.schema import Document
|
|
|
|
from haystack.document_stores import (
|
|
|
|
InMemoryDocumentStore,
|
|
|
|
ElasticsearchDocumentStore,
|
|
|
|
WeaviateDocumentStore,
|
|
|
|
MilvusDocumentStore,
|
|
|
|
PineconeDocumentStore,
|
|
|
|
OpenSearchDocumentStore,
|
|
|
|
FAISSDocumentStore,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
2023-03-06 17:26:19 +01:00
|
|
|
# Directory named "samples" located next to this file
SAMPLES_PATH = Path(__file__).parent.joinpath("samples")
|
2023-03-01 14:52:05 +01:00
|
|
|
|
|
|
|
|
2023-03-01 17:30:55 +01:00
|
|
|
# Pin the seed of every random number generator in use here (Python, NumPy, PyTorch)
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
|
|
|
|
|
|
|
|
|
2023-02-16 09:43:25 +01:00
|
|
|
@pytest.fixture
def docs_all_formats():
    """Return five sample documents expressed in three different formats.

    The list holds, in order: one dictionary with the metadata keys at the top
    level, one dictionary with the metadata nested under ``"meta"``, and three
    ``Document`` objects.
    """
    # Metadata fields at the top level, kept for backward compatibility
    flat_dict_doc = {
        "content": "My name is Paul and I live in New York",
        "meta_field": "test2",
        "name": "filename2",
        "date_field": "2019-10-01",
        "numeric_field": 5.0,
    }

    # "dict" format: metadata nested under the "meta" key
    nested_dict_doc = {
        "content": "My name is Carla and I live in Berlin",
        "meta": {"meta_field": "test1", "name": "filename1", "date_field": "2020-03-01", "numeric_field": 5.5},
    }

    # (content, meta) pairs turned into Document objects below
    document_specs = (
        (
            "My name is Christelle and I live in Paris",
            {"meta_field": "test3", "name": "filename3", "date_field": "2018-10-01", "numeric_field": 4.5},
        ),
        (
            "My name is Camila and I live in Madrid",
            {"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3.0},
        ),
        (
            "My name is Matteo and I live in Rome",
            {"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0.0},
        ),
    )
    document_objects = [Document(content=text, meta=meta) for text, meta in document_specs]

    return [flat_dict_doc, nested_dict_doc, *document_objects]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def docs(docs_all_formats):
    """Return the items of ``docs_all_formats`` with every dictionary converted to a ``Document``.

    Items that are not dictionaries are passed through unchanged; order is preserved.
    """
    documents = []
    for entry in docs_all_formats:
        if isinstance(entry, dict):
            entry = Document.from_dict(entry)
        documents.append(entry)
    return documents
|
|
|
|
|
|
|
|
|
|
|
|
@contextmanager
def document_store(
    name,
    docs,
    tmp_path,
    embedding_dim=768,
    embedding_field="embedding",
    index="haystack_test",
    similarity="cosine",  # cosine is default similarity as dot product is not supported by Weaviate
    recreate_index=True,
):
    """Context manager that builds a document store, fills it with ``docs`` and cleans it up.

    The store selected by ``name`` is created, ``docs`` are written into it and the
    store is yielded. When the ``with`` block is left, whether normally or through
    an exception, the store's index is deleted.

    :param name: Which store to build. One of ``"memory"``, ``"elasticsearch"``,
        ``"faiss"``, ``"milvus"``, ``"weaviate"``, ``"pinecone"`` or ``"opensearch_faiss"``.
    :param docs: Documents written to the store before it is yielded. For
        ``"weaviate"`` the ``id`` of every document is overwritten in place.
    :param tmp_path: Directory that receives the SQLite database file used by the
        ``"faiss"`` and ``"milvus"`` stores.
    :param embedding_dim: Embedding dimension forwarded to the store constructor.
    :param embedding_field: Embedding field name forwarded to the store constructor
        (not forwarded for ``"weaviate"``).
    :param index: Index name forwarded to the store constructor.
    :param similarity: Similarity function forwarded to the store constructor.
    :param recreate_index: Forwarded to the constructors that receive it here
        (every store except ``"memory"`` and ``"faiss"``).
    :raises Exception: When the context is entered with an unknown ``name``.
    """
    if name == "memory":
        store = InMemoryDocumentStore(
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            use_bm25=True,
        )

    elif name == "elasticsearch":
        # make sure we start from a fresh index
        store = ElasticsearchDocumentStore(
            index=index,
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            similarity=similarity,
            recreate_index=recreate_index,
        )

    elif name == "faiss":
        store = FAISSDocumentStore(
            embedding_dim=embedding_dim,
            sql_url=f"sqlite:///{tmp_path}/haystack_test.db",
            return_embedding=True,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            isolation_level="AUTOCOMMIT",
        )

    elif name == "milvus":
        store = MilvusDocumentStore(
            embedding_dim=embedding_dim,
            sql_url=f"sqlite:///{tmp_path}/haystack_test.db",
            return_embedding=True,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            isolation_level="AUTOCOMMIT",
            recreate_index=recreate_index,
        )

    elif name == "weaviate":
        store = WeaviateDocumentStore(
            index=index, similarity=similarity, embedding_dim=embedding_dim, recreate_index=recreate_index
        )
        # Give every document a fresh random UUID as its id.
        # NOTE(review): presumably because Weaviate only accepts UUID ids - confirm.
        for d in docs:
            d.id = str(uuid.uuid4())

    elif name == "pinecone":
        store = PineconeDocumentStore(
            # Fall back to a placeholder key when PINECONE_API_KEY is unset or empty
            api_key=os.environ.get("PINECONE_API_KEY") or "fake-haystack-test-key",
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            recreate_index=recreate_index,
            metadata_config={
                "indexed": [
                    "meta_field",
                    "name",
                    "date_field",
                    "numeric_field",
                    "f1",
                    "f3",
                    "meta_id",
                    "meta_field_for_count",
                    "meta_key_1",
                    "meta_key_2",
                ]
            },
        )

    elif name == "opensearch_faiss":
        store = OpenSearchDocumentStore(
            index=index,
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            similarity=similarity,
            recreate_index=recreate_index,
            port=9201,
            knn_engine="faiss",
        )

    else:
        raise Exception(f"No document store fixture for '{name}'")

    # An exception raised inside the ``with`` body is re-raised at the ``yield``;
    # the ``finally`` clause guarantees the index is removed in that case too.
    try:
        store.write_documents(docs)
        yield store
    finally:
        store.delete_index(store.index)
|