mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-31 12:48:43 +00:00
338 lines
18 KiB
Python
338 lines
18 KiB
Python
![]() |
from typing import List, Union, Dict, Any
|
||
|
|
||
|
import os
|
||
|
from datetime import datetime
|
||
|
from inspect import getmembers, isclass, isfunction
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from haystack.document_stores.pinecone import PineconeDocumentStore
|
||
|
from haystack.schema import Document
|
||
|
from haystack.errors import FilterError
|
||
|
|
||
|
|
||
|
from ..mocks import pinecone as pinecone_mock
|
||
|
from ..conftest import SAMPLES_PATH
|
||
|
|
||
|
|
||
|
# Set metadata fields used during testing for PineconeDocumentStore meta_config
|
||
|
META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document"]
|
||
|
|
||
|
|
||
|
#
|
||
|
# FIXME This class should extend the base Document Store test class once it exists.
|
||
|
# At that point some of the fixtures will be duplicated, so review them.
|
||
|
#
|
||
|
class TestPineconeDocumentStore:
|
||
|
|
||
|
# Fixtures
|
||
|
|
||
|
@pytest.fixture
def doc_store(self, monkeypatch, request) -> PineconeDocumentStore:
    """
    Provide a PineconeDocumentStore for a single test.

    Unless the requesting test carries the "integration" keyword, every function and class
    exposed by the ``pinecone_mock`` module is patched onto the ``pinecone`` package first,
    so unit tests never talk to the real service.

    No explicit teardown happens here: the patches are applied through ``monkeypatch`` and the
    store is built with ``recreate_index=True`` (presumably so each test starts from an empty
    index -- confirm against PineconeDocumentStore).
    """
    # If it's a unit test, mock Pinecone
    if "integration" not in request.keywords:
        # raising=False: the mock may expose names the real package does not define.
        for fname, function in getmembers(pinecone_mock, isfunction):
            monkeypatch.setattr(f"pinecone.{fname}", function, raising=False)
        for cname, class_ in getmembers(pinecone_mock, isclass):
            monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False)

    return PineconeDocumentStore(
        # Fall back to a dummy key so that mocked runs do not need real credentials.
        api_key=os.environ.get("PINECONE_API_KEY") or "fake-pinecone-test-key",
        embedding_dim=768,
        embedding_field="embedding",
        index="haystack_tests",
        similarity="cosine",
        recreate_index=True,
        metadata_config={"indexed": META_FIELDS},
    )
|
||
|
|
||
|
@pytest.fixture
def doc_store_with_docs(self, doc_store: PineconeDocumentStore, docs: List[Document]) -> PineconeDocumentStore:
    """
    Provide the store from the ``doc_store`` fixture after the ``docs`` fixture's documents
    have been written into it.
    """
    populated_store = doc_store
    populated_store.write_documents(docs)
    return populated_store
|
||
|
|
||
|
@pytest.fixture
def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]:
    """
    Return six sample documents expressed in every input format used by the tests:
    a flat dict, a dict with a nested "meta" mapping, Document objects with meta,
    and a Document without any meta.
    """
    # Metadata at the top level of the dict, kept for backward compatibility
    flat_dict_doc = {
        "content": "My name is Paul and I live in New York",
        "meta_field": "test-1",
        "name": "file_1.txt",
        "date": "2019-10-01",
        "numeric_field": 5.0,
        "odd_document": True,
    }

    # "dict" format with an explicit "meta" mapping
    nested_dict_doc = {
        "content": "My name is Carla and I live in Berlin",
        "meta": {
            "meta_field": "test-2",
            "name": "file_2.txt",
            "date": "2020-03-01",
            "numeric_field": 5.5,
            "odd_document": False,
        },
    }

    # Document objects carrying metadata
    christelle = Document(
        content="My name is Christelle and I live in Paris",
        meta={
            "meta_field": "test-3",
            "name": "file_3.txt",
            "date": "2018-10-01",
            "numeric_field": 4.5,
            "odd_document": True,
        },
    )
    camila = Document(
        content="My name is Camila and I live in Madrid",
        meta={
            "meta_field": "test-4",
            "name": "file_4.txt",
            "date": "2021-02-01",
            "numeric_field": 3.0,
            "odd_document": False,
        },
    )
    matteo = Document(
        content="My name is Matteo and I live in Rome",
        meta={
            "meta_field": "test-5",
            "name": "file_5.txt",
            "date": "2019-01-01",
            "numeric_field": 0.0,
            "odd_document": True,
        },
    )

    # Document object without meta
    ahmed = Document(content="My name is Ahmed and I live in Cairo")

    return [flat_dict_doc, nested_dict_doc, christelle, camila, matteo, ahmed]
|
||
|
|
||
|
@pytest.fixture
def docs(self, docs_all_formats: List[Union[Document, Dict[str, Any]]]) -> List[Document]:
    """
    Return the sample documents with every dict entry converted through
    ``Document.from_dict``; entries that are not dicts are passed through unchanged.
    """
    normalized: List[Document] = []
    for entry in docs_all_formats:
        if isinstance(entry, dict):
            normalized.append(Document.from_dict(entry))
        else:
            normalized.append(entry)
    return normalized
|
||
|
|
||
|
#
|
||
|
# Tests
|
||
|
#
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore):
    """An explicit ``$eq`` filter returns the same documents as the shorthand equality filter."""
    explicit_filter = {"meta_field": {"$eq": "test-1"}}
    shorthand_filter = {"meta_field": "test-1"}

    explicit_result = doc_store_with_docs.get_all_documents(filters=explicit_filter)
    shorthand_result = doc_store_with_docs.get_all_documents(filters=shorthand_filter)

    assert explicit_result == shorthand_result
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_in(self, doc_store_with_docs: PineconeDocumentStore):
    """An explicit ``$in`` filter returns the same documents as the shorthand list filter."""
    explicit_filter = {"meta_field": {"$in": ["test-1", "test-2", "n.a."]}}
    shorthand_filter = {"meta_field": ["test-1", "test-2", "n.a."]}

    explicit_result = doc_store_with_docs.get_all_documents(filters=explicit_filter)
    shorthand_result = doc_store_with_docs.get_all_documents(filters=shorthand_filter)

    assert explicit_result == shorthand_result
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_ne(self, doc_store_with_docs: PineconeDocumentStore):
    """No document returned for a ``$ne`` filter carries the excluded ``meta_field`` value."""
    remaining = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$ne": "test-1"}})

    # Documents without the field yield None, which also differs from "test-1".
    for doc in remaining:
        assert "test-1" != doc.meta.get("meta_field")
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore):
    """No document returned for a ``$nin`` filter carries "test-1" or "test-2" as ``meta_field``."""
    nin_filter = {"meta_field": {"$nin": ["test-1", "test-2", "n.a."]}}
    remaining = doc_store_with_docs.get_all_documents(filters=nin_filter)

    excluded_values = {"test-1", "test-2"}
    seen_values = {doc.meta.get("meta_field") for doc in remaining}
    assert not (excluded_values & seen_values)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore):
    """Every document returned for ``$gt: 3.0`` has a ``numeric_field`` strictly above 3.0."""
    matching = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})

    for doc in matching:
        assert doc.meta["numeric_field"] > 3.0
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore):
    """Every document returned for ``$gte: 3.0`` has a ``numeric_field`` of at least 3.0."""
    matching = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})

    for doc in matching:
        assert doc.meta["numeric_field"] >= 3.0
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_lt(self, doc_store_with_docs: PineconeDocumentStore):
    """Every document returned for ``$lt: 3.0`` has a ``numeric_field`` strictly below 3.0."""
    matching = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})

    for doc in matching:
        assert doc.meta["numeric_field"] < 3.0
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_lte(self, doc_store_with_docs: PineconeDocumentStore):
    """Every document returned for ``$lte: 3.0`` has a ``numeric_field`` of at most 3.0."""
    matching = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})

    for doc in matching:
        assert doc.meta["numeric_field"] <= 3.0
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_dates(self, doc_store_with_docs: PineconeDocumentStore):
    """A ``$lte``/``$gte`` range over string dates must be rejected with a FilterError."""
    filters = {"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"}}

    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_dates_and_other_field_explicit(
    self, doc_store_with_docs: PineconeDocumentStore
):
    """An explicit ``$and`` of a string-date range and a ``$in`` on ``name`` must raise FilterError."""
    filters = {
        "$and": {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "name": {"$in": ["file_5.txt", "file_3.txt"]},
        }
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_dates_and_other_field_simplified(
    self, doc_store_with_docs: PineconeDocumentStore
):
    """The shorthand form of a string-date range combined with a ``name`` list must raise FilterError."""
    filters_simplified = {
        "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
        "name": ["file_5.txt", "file_3.txt"],
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters_simplified)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_dates_and_or_explicit(
    self, doc_store_with_docs: PineconeDocumentStore
):
    """An explicit ``$and`` of a string-date range and a nested ``$or`` must raise FilterError."""
    filters = {
        "$and": {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {"name": {"$in": ["file_5.txt", "file_3.txt"]}, "numeric_field": {"$lte": 5.0}},
        }
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_dates_and_or_simplified(
    self, doc_store_with_docs: PineconeDocumentStore
):
    """The shorthand form of a string-date range combined with a nested ``$or`` must raise FilterError."""
    filters_simplified = {
        "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
        "$or": {"name": ["file_5.txt", "file_3.txt"], "numeric_field": {"$lte": 5.0}},
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters_simplified)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_explicit(
    self, doc_store_with_docs: PineconeDocumentStore
):
    """An explicit ``$and``/``$or``/``$and``/``$not`` tree containing a string-date range must raise FilterError."""
    filters = {
        "$and": {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {
                "name": {"$in": ["file_5.txt", "file_3.txt"]},
                "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test-2"}}},
            },
        }
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_simplified(
    self, doc_store_with_docs: PineconeDocumentStore
):
    """The shorthand ``$or``/``$and``/``$not`` tree containing a string-date range must raise FilterError."""
    filters_simplified = {
        "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
        "$or": {
            "name": ["file_5.txt", "file_3.txt"],
            "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test-2"}},
        },
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters_simplified)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_nested_not(self, doc_store_with_docs: PineconeDocumentStore):
    """Logical operators nested inside ``$not`` with a ``$lt`` on a string date must raise FilterError."""
    # Test nested logical operations within "$not". The original comment motivates this case with the
    # De Morgan rewriting done by the Weaviate document store.
    filters = {
        "$not": {
            "$or": {
                "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test-3"}},
                "$not": {"date": {"$lt": "2020-01-01"}},
            }
        }
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]t' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters)
|
||
|
|
||
|
@pytest.mark.pinecone
# NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
# this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
# one present in the base document store suite, and can be removed from here.
def test_get_all_documents_extended_filter_compound_same_level_not(
    self, doc_store_with_docs: PineconeDocumentStore
):
    """Two ``$and`` clauses on the same level of an ``$or`` list, each comparing string dates, must raise FilterError."""
    # Test the same logical operator twice on the same level. The original comment motivates this case
    # with the De Morgan rewriting done by the Weaviate document store.
    filters = {
        "$or": [
            {"$and": {"meta_field": {"$in": ["test-1", "test-2"]}, "date": {"$gte": "2020-01-01"}}},
            {"$and": {"meta_field": {"$in": ["test-3", "test-4"]}, "date": {"$lt": "2020-01-01"}}},
        ]
    }

    # Raw string: "\$" is an invalid escape sequence in a normal string literal.
    # "[lg]" (not "[l|g]"): inside a character class "|" is a literal, not an alternation.
    with pytest.raises(FilterError, match=r"Comparison value for '\$[lg]te' operation must be a float or int."):
        doc_store_with_docs.get_all_documents(filters=filters)
|