# haystack/test/document_stores/test_pinecone.py

from typing import List, Union, Dict, Any
import os
import numpy as np
from inspect import getmembers, isclass, isfunction
from unittest.mock import MagicMock
import pytest
from haystack.document_stores.pinecone import PineconeDocumentStore
from haystack.schema import Document
from haystack.errors import FilterError, PineconeDocumentStoreError
from haystack.testing import DocumentStoreBaseTestAbstract
from ..mocks import pinecone as pinecone_mock
from ..conftest import MockBaseRetriever

# Set metadata fields used during testing for PineconeDocumentStore meta_config
META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document"]
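# (Background note, not asserted by the tests themselves: the `metadata_config={"indexed": ...}`
# argument passed to the store below relies on Pinecone's selective metadata indexing, where only the
# listed fields are indexed for filtering; the exact behaviour depends on the Pinecone index type.)

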
class TestPineconeDocumentStore(DocumentStoreBaseTestAbstract):
# Fixtures
@pytest.fixture
def ds(self, monkeypatch, request) -> PineconeDocumentStore:
"""
This fixture provides an empty document store and takes care of cleaning up after each test
"""
# If it's a unit test, mock Pinecone
if request.config.getoption("--mock-pinecone"):
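            # Swap every function and class defined in the local pinecone mock module into the real
            # `pinecone` package; raising=False tells monkeypatch not to fail when the real package
            # does not define a given attribute.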
for fname, function in getmembers(pinecone_mock, isfunction):
monkeypatch.setattr(f"pinecone.{fname}", function, raising=False)
for cname, class_ in getmembers(pinecone_mock, isclass):
monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False)
return PineconeDocumentStore(
api_key=os.environ.get("PINECONE_API_KEY") or "fake-pinecone-test-key",
embedding_dim=768,
embedding_field="embedding",
index="haystack_tests",
similarity="cosine",
recreate_index=True,
metadata_config={"indexed": META_FIELDS},
)
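
    # The `--mock-pinecone` command line flag read by the `ds` fixture is assumed to be registered in
    # the test suite's conftest.py. A minimal sketch of such a hook (hypothetical, shown only for
    # context) could look like:
    #
    #     def pytest_addoption(parser):
    #         parser.addoption(
    #             "--mock-pinecone",
    #             action="store_true",
    #             default=False,
    #             help="Run Pinecone tests against the local mock instead of the real service",
    #         )
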
@pytest.fixture
def doc_store_with_docs(self, ds: PineconeDocumentStore, documents: List[Document]) -> PineconeDocumentStore:
"""
This fixture provides a pre-populated document store and takes care of cleaning up after each test
"""
ds.write_documents(documents)
return ds
@pytest.fixture
def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]:
return [
# metafield at the top level for backward compatibility
{
"content": "My name is Paul and I live in New York",
"meta_field": "test-1",
"name": "file_1.txt",
"date": "2019-10-01",
"numeric_field": 5.0,
"odd_document": True,
"year": "2021",
"month": "02",
},
# "dict" format
{
"content": "My name is Carla and I live in Berlin",
"meta": {
"meta_field": "test-2",
"name": "file_2.txt",
"date": "2020-03-01",
"numeric_field": 5.5,
"odd_document": False,
"year": "2021",
"month": "02",
},
},
# Document object
Document(
content="My name is Christelle and I live in Paris",
meta={
"meta_field": "test-3",
"name": "file_3.txt",
"date": "2018-10-01",
"numeric_field": 4.5,
"odd_document": True,
"year": "2020",
"month": "02",
},
),
Document(
content="My name is Camila and I live in Madrid",
meta={
"meta_field": "test-4",
"name": "file_4.txt",
"date": "2021-02-01",
"numeric_field": 3.0,
"odd_document": False,
"year": "2020",
},
),
Document(
content="My name is Matteo and I live in Rome",
meta={
"meta_field": "test-5",
"name": "file_5.txt",
"date": "2019-01-01",
"numeric_field": 0.0,
"odd_document": True,
"year": "2020",
},
),
Document(
content="My name is Adele and I live in London",
meta={
"meta_field": "test-5",
"name": "file_5.txt",
"date": "2019-01-01",
"numeric_field": 0.0,
"odd_document": True,
"year": "2021",
},
),
# Without meta
Document(content="My name is Ahmed and I live in Cairo"),
Document(content="My name is Bruce and I live in Gotham"),
Document(content="My name is Peter and I live in Quahog"),
]
@pytest.fixture
def documents(self, docs_all_formats: List[Union[Document, Dict[str, Any]]]) -> List[Document]:
return [Document.from_dict(doc) if isinstance(doc, dict) else doc for doc in docs_all_formats]
#
# Tests
#
@pytest.mark.integration
    def test_doc_store_wrong_init(self):
        """
        Initializing the document store with a `pinecone_index` that is not a `pinecone.Index`
        object must fail with a PineconeDocumentStoreError.
        """
        with pytest.raises(
            PineconeDocumentStoreError, match="`pinecone_index` needs to be a `pinecone.Index` object"
        ):
            PineconeDocumentStore(
                api_key=os.environ.get("PINECONE_API_KEY") or "fake-pinecone-test-key",
                embedding_dim=768,
                pinecone_index="p_index",
                embedding_field="embedding",
                index="haystack_tests",
                similarity="cosine",
                metadata_config={"indexed": META_FIELDS},
            )
@pytest.mark.integration
def test_ne_filters(self, ds, documents):
ds.write_documents(documents)
result = ds.get_all_documents(filters={"year": {"$ne": "2020"}})
assert len(result) == 3
@pytest.mark.integration
def test_get_label_count(self, ds, labels):
with pytest.raises(NotImplementedError):
ds.get_label_count()
    # NOTE: the PineconeDocumentStore behaves differently from the other document stores when filters
    # are applied. While this should be considered a bug, the related tests are skipped in the meantime.
@pytest.mark.skip
@pytest.mark.integration
def test_compound_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_nin_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_comparison_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_nested_condition_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_nested_condition_not_filters(self, ds, documents):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_delete_documents_by_id_with_filters(self, ds, documents):
pass
# NOTE: labels metadata are not supported
@pytest.mark.skip
@pytest.mark.integration
def test_delete_labels_by_filter(self, ds, labels):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_delete_labels_by_filter_id(self, ds, labels):
pass
@pytest.mark.skip
@pytest.mark.integration
def test_simplified_filters(self, ds, documents):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_labels_with_long_texts(self):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_multilabel(self):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_multilabel_no_answer(self):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_multilabel_filter_aggregations(self):
pass
@pytest.mark.skip(reason="labels metadata are not supported")
@pytest.mark.integration
def test_multilabel_meta_aggregations(self):
pass
    # NOTE: Pinecone does not support dates, so it can't do `$lte` or `$gte` comparisons on date fields.
    # When a new release introduces this feature, the entire family of test_get_all_documents_extended_filter_*
    # tests will become identical to the ones in the base document store suite and can be removed from here.
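    # (A possible workaround, sketched here only as an assumption and not used by this suite: store each
    # date a second time as a numeric Unix timestamp, e.g. `"date_ts": 1583020800.0` for 2020-03-01, and
    # filter on that field with numeric comparisons such as
    # `{"date_ts": {"$gte": 1546300800.0, "$lte": 1609372800.0}}`.)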
@pytest.mark.integration
def test_get_all_documents_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore):
eq_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test-1"}})
normal_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": "test-1"})
assert eq_docs == normal_docs
@pytest.mark.integration
def test_get_all_documents_extended_filter_in(self, doc_store_with_docs: PineconeDocumentStore):
in_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$in": ["test-1", "test-2", "n.a."]}})
normal_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": ["test-1", "test-2", "n.a."]})
assert in_docs == normal_docs
@pytest.mark.integration
def test_get_all_documents_extended_filter_ne(self, doc_store_with_docs: PineconeDocumentStore):
retrieved_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$ne": "test-1"}})
assert all("test-1" != d.meta.get("meta_field", None) for d in retrieved_docs)
@pytest.mark.integration
def test_get_all_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore):
retrieved_docs = doc_store_with_docs.get_all_documents(
filters={"meta_field": {"$nin": ["test-1", "test-2", "n.a."]}}
)
assert {"test-1", "test-2"}.isdisjoint({d.meta.get("meta_field", None) for d in retrieved_docs})
@pytest.mark.integration
def test_get_all_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore):
retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
assert all(d.meta["numeric_field"] > 3.0 for d in retrieved_docs)
@pytest.mark.integration
def test_get_all_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore):
retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
assert all(d.meta["numeric_field"] >= 3.0 for d in retrieved_docs)
@pytest.mark.integration
def test_get_all_documents_extended_filter_lt(self, doc_store_with_docs: PineconeDocumentStore):
retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
assert all(d.meta["numeric_field"] < 3.0 for d in retrieved_docs)
@pytest.mark.integration
def test_get_all_documents_extended_filter_lte(self, doc_store_with_docs: PineconeDocumentStore):
retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
assert all(d.meta["numeric_field"] <= 3.0 for d in retrieved_docs)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_dates(self, doc_store_with_docs: PineconeDocumentStore):
filters = {"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"}}
with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_dates_and_other_field_explicit(
self, doc_store_with_docs: PineconeDocumentStore
):
filters = {
"$and": {
"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"name": {"$in": ["file_5.txt", "file_3.txt"]},
}
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_dates_and_other_field_simplified(
self, doc_store_with_docs: PineconeDocumentStore
):
filters_simplified = {
"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"name": ["file_5.txt", "file_3.txt"],
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters_simplified)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_dates_and_or_explicit(
self, doc_store_with_docs: PineconeDocumentStore
):
filters = {
"$and": {
"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {"name": {"$in": ["file_5.txt", "file_3.txt"]}, "numeric_field": {"$lte": 5.0}},
}
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_dates_and_or_simplified(
self, doc_store_with_docs: PineconeDocumentStore
):
filters_simplified = {
"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {"name": ["file_5.txt", "file_3.txt"], "numeric_field": {"$lte": 5.0}},
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters_simplified)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_explicit(
self, doc_store_with_docs: PineconeDocumentStore
):
filters = {
"$and": {
"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": {"$in": ["file_5.txt", "file_3.txt"]},
"$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test-2"}}},
},
}
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_simplified(
self, doc_store_with_docs: PineconeDocumentStore
):
filters_simplified = {
"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": ["file_5.txt", "file_3.txt"],
"$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test-2"}},
},
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters_simplified)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_nested_not(self, doc_store_with_docs: PineconeDocumentStore):
        # Test nested logical operations within "$not", important as we apply De Morgan's laws in the WeaviateDocumentStore
filters = {
"$not": {
"$or": {
"$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test-3"}},
"$not": {"date": {"$lt": "2020-01-01"}},
}
}
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]t' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters)
@pytest.mark.integration
def test_get_all_documents_extended_filter_compound_same_level_not(
self, doc_store_with_docs: PineconeDocumentStore
):
        # Test the same logical operator twice on the same level, important as we apply De Morgan's laws in the WeaviateDocumentStore
filters = {
"$or": [
{"$and": {"meta_field": {"$in": ["test-1", "test-2"]}, "date": {"$gte": "2020-01-01"}}},
{"$and": {"meta_field": {"$in": ["test-3", "test-4"]}, "date": {"$lt": "2020-01-01"}}},
]
}
        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
doc_store_with_docs.get_all_documents(filters=filters)
@pytest.mark.integration
def test_multilayer_dict(self, doc_store_with_docs: PineconeDocumentStore):
# Test that multilayer dict can be upserted
multilayer_meta = {
"parent1": {"parent2": {"parent3": {"child1": 1, "child2": 2}}},
"meta_field": "multilayer-test",
}
doc = Document(
content=f"Multilayered dict", meta=multilayer_meta, embedding=np.random.rand(768).astype(np.float32)
)
doc_store_with_docs.write_documents([doc])
retrieved_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "multilayer-test"}})
assert len(retrieved_docs) == 1
assert retrieved_docs[0].meta == multilayer_meta
@pytest.mark.unit
def test_skip_validating_empty_embeddings(self, ds: PineconeDocumentStore):
document = Document(id="0", content="test")
retriever = MockBaseRetriever(document_store=ds, mock_document=document)
ds.write_documents(documents=[document])
ds._validate_embeddings_shape = MagicMock()
ds.update_embeddings(retriever)
ds._validate_embeddings_shape.assert_called_once()
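        # After the first call, the only stored document presumably already has an embedding, so the
        # second pass with update_existing_embeddings=False is expected to produce no new vectors and
        # the embedding shape validation must not run again.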
ds.update_embeddings(retriever, update_existing_embeddings=False)
ds._validate_embeddings_shape.assert_called_once()
@pytest.mark.integration
def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore):
"""
We expect 1 doc with an embeddings because all documents in already written in doc_store_with_docs contain no
embeddings.
"""
doc = Document(content=f"Doc with embedding", embedding=np.random.rand(768).astype(np.float32))
doc_store_with_docs.write_documents([doc])
assert doc_store_with_docs.get_embedding_count() == 1
@pytest.mark.unit
def test_get_all_labels_legacy_document_id(self, ds, monkeypatch):
monkeypatch.setattr(
ds,
"get_all_documents",
MagicMock(
return_value=[
Document.from_dict(
{
"content": "My name is Carla and I live in Berlin",
"content_type": "text",
"score": None,
"meta": {
"label-id": "d9256445-7b8a-4a33-a558-402ec84d6881",
"query": "query_1",
"label-is-correct-answer": False,
"label-is-correct-document": True,
"label-document-content": "My name is Carla and I live in Berlin",
"label-document-id": "a0747b83aea0b60c4b114b15476dd32d",
"label-no-answer": False,
"label-origin": "user-feedback",
"label-created-at": "2023-02-07 14:46:54",
"label-updated-at": None,
"label-pipeline-id": None,
"label-document-meta-meta_field": "test-2",
"label-document-meta-name": "file_2.txt",
"label-document-meta-date": "2020-03-01",
"label-document-meta-numeric_field": 5.5,
"label-document-meta-odd_document": False,
"label-document-meta-year": "2021",
"label-document-meta-month": "02",
"label-meta-name": "label_1",
"label-meta-year": "2021",
"label-answer-answer": "the answer is 1",
"label-answer-type": "extractive",
"label-answer-score": None,
"label-answer-context": None,
# legacy document_id answer
"label-answer-document-id": "a0747b83aea0b60c4b114b15476dd32d",
"label-answer-offsets-in-document-start": None,
"label-answer-offsets-in-document-end": None,
"label-answer-offsets-in-context-start": None,
"label-answer-offsets-in-context-end": None,
},
"id_hash_keys": ["content"],
"embedding": None,
"id": "d9256445-7b8a-4a33-a558-402ec84d6881",
}
)
]
),
)
labels = ds.get_all_labels()
assert labels[0].answer.document_ids == ["a0747b83aea0b60c4b114b15476dd32d"]