import os
import uuid
from contextlib import contextmanager
import random
from pathlib import Path

import torch
import numpy as np
import pytest

from haystack.schema import Document
from haystack.document_stores import (
    InMemoryDocumentStore,
    ElasticsearchDocumentStore,
    WeaviateDocumentStore,
    MilvusDocumentStore,
    PineconeDocumentStore,
    OpenSearchDocumentStore,
    FAISSDocumentStore,
)


SAMPLES_PATH = Path(__file__).parent.parent / "test" / "samples"

# Fix all random seeds that come to mind
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)


@pytest.fixture
def docs_all_formats():
    """Return five sample documents expressed in the three accepted input formats.

    The list mixes a flat dict (meta fields at the top level), a dict with a
    nested ``"meta"`` mapping, and ready-made ``Document`` objects.
    """
    return [
        # metafield at the top level for backward compatibility
        {
            "content": "My name is Paul and I live in New York",
            "meta_field": "test2",
            "name": "filename2",
            "date_field": "2019-10-01",
            "numeric_field": 5.0,
        },
        # "dict" format
        {
            "content": "My name is Carla and I live in Berlin",
            "meta": {"meta_field": "test1", "name": "filename1", "date_field": "2020-03-01", "numeric_field": 5.5},
        },
        # Document object
        Document(
            content="My name is Christelle and I live in Paris",
            meta={"meta_field": "test3", "name": "filename3", "date_field": "2018-10-01", "numeric_field": 4.5},
        ),
        Document(
            content="My name is Camila and I live in Madrid",
            meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3.0},
        ),
        Document(
            content="My name is Matteo and I live in Rome",
            meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0.0},
        ),
    ]


@pytest.fixture
def docs(docs_all_formats):
    """Return the sample documents normalized to ``Document`` objects.

    Dict entries are converted with ``Document.from_dict``; entries that are
    already ``Document`` instances are passed through unchanged.
    """
    return [Document.from_dict(doc) if isinstance(doc, dict) else doc for doc in docs_all_formats]


@contextmanager
def document_store(
    name,
    docs,
    tmp_path,
    embedding_dim=768,
    embedding_field="embedding",
    index="haystack_test",
    similarity="cosine",  # cosine is default similarity as dot product is not supported by Weaviate
    recreate_index=True,
):
    """Yield a document store of the requested kind, pre-filled with ``docs``.

    The store's index is deleted when the ``with`` block exits, including when
    the block raises, so a failing test does not leave data behind.

    Args:
        name: Backend selector. One of ``"memory"``, ``"elasticsearch"``,
            ``"faiss"``, ``"milvus"``, ``"weaviate"``, ``"pinecone"`` or
            ``"opensearch_faiss"``.
        docs: Documents written to the store before it is yielded. For
            ``"weaviate"`` each document's ``id`` is overwritten in place with
            a fresh UUID string.
        tmp_path: Directory holding the SQLite database used by the
            ``"faiss"`` and ``"milvus"`` backends.
        embedding_dim: Embedding dimensionality passed to the store.
        embedding_field: Name of the embedding field (not passed to Weaviate).
        index: Index name passed to the store.
        similarity: Similarity function name passed to the store.
        recreate_index: Forwarded to the backends that accept it
            (every backend except ``"memory"`` and ``"faiss"``).

    Yields:
        The initialized document store.

    Raises:
        Exception: If ``name`` does not match any known backend.
    """
    if name == "memory":
        store = InMemoryDocumentStore(
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            use_bm25=True,
        )

    elif name == "elasticsearch":
        # make sure we start from a fresh index
        store = ElasticsearchDocumentStore(
            index=index,
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            similarity=similarity,
            recreate_index=recreate_index,
        )

    elif name == "faiss":
        store = FAISSDocumentStore(
            embedding_dim=embedding_dim,
            sql_url=f"sqlite:///{tmp_path}/haystack_test.db",
            return_embedding=True,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            isolation_level="AUTOCOMMIT",
        )

    elif name == "milvus":
        store = MilvusDocumentStore(
            embedding_dim=embedding_dim,
            sql_url=f"sqlite:///{tmp_path}/haystack_test.db",
            return_embedding=True,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            isolation_level="AUTOCOMMIT",
            recreate_index=recreate_index,
        )

    elif name == "weaviate":
        store = WeaviateDocumentStore(
            index=index, similarity=similarity, embedding_dim=embedding_dim, recreate_index=recreate_index
        )
        # NOTE(review): presumably Weaviate only accepts UUID-formatted ids - confirm.
        # This mutates the caller's Document objects in place.
        for d in docs:
            d.id = str(uuid.uuid4())

    elif name == "pinecone":
        store = PineconeDocumentStore(
            # Falls back to a placeholder key when PINECONE_API_KEY is unset or empty.
            api_key=os.environ.get("PINECONE_API_KEY") or "fake-haystack-test-key",
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            index=index,
            similarity=similarity,
            recreate_index=recreate_index,
            metadata_config={
                "indexed": [
                    "meta_field",
                    "name",
                    "date_field",
                    "numeric_field",
                    "f1",
                    "f3",
                    "meta_id",
                    "meta_field_for_count",
                    "meta_key_1",
                    "meta_key_2",
                ]
            },
        )

    elif name == "opensearch_faiss":
        store = OpenSearchDocumentStore(
            index=index,
            return_embedding=True,
            embedding_dim=embedding_dim,
            embedding_field=embedding_field,
            similarity=similarity,
            recreate_index=recreate_index,
            port=9201,
            knn_engine="faiss",
        )

    else:
        raise Exception(f"No document store fixture for '{name}'")

    # An exception raised inside the caller's ``with`` body is re-raised at the
    # ``yield``; without ``finally`` the index would be left behind and could
    # pollute subsequent tests.
    try:
        store.write_documents(docs)
        yield store
    finally:
        store.delete_index(store.index)