import os
from inspect import getmembers, isclass, isfunction
from typing import Any, Dict, List, Union
from unittest.mock import MagicMock

import numpy as np
import pytest

from haystack.document_stores.pinecone import (
    DOCUMENT_WITH_EMBEDDING,
    DOCUMENT_WITHOUT_EMBEDDING,
    TYPE_METADATA_FIELD,
    PineconeDocumentStore,
    pinecone,
)
from haystack.errors import FilterError, PineconeDocumentStoreError
from haystack.schema import Document
from haystack.testing import DocumentStoreBaseTestAbstract

from ..conftest import MockBaseRetriever
from ..mocks import pinecone as pinecone_mock

# Set metadata fields used during testing for PineconeDocumentStore meta_config
META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document", "doc_type"]


class TestPineconeDocumentStore(DocumentStoreBaseTestAbstract):
    """
    Runs the shared `DocumentStoreBaseTestAbstract` suite against `PineconeDocumentStore` and adds
    Pinecone-specific tests.

    Methods that are skipped with a `pass` body reuse names from the shared suite so that the inherited
    versions are not executed for Pinecone.
    """

    # Fixtures

    @pytest.fixture
    def ds(self, monkeypatch, request) -> PineconeDocumentStore:
        """
        This fixture provides an empty document store for each test.

        When pytest is started with `--mock-pinecone`, every function and class exposed by the local
        `pinecone_mock` module is patched over the `pinecone` package before the store is created.
        """
        # If it's a unit test, mock Pinecone
        if request.config.getoption("--mock-pinecone"):
            for fname, function in getmembers(pinecone_mock, isfunction):
                monkeypatch.setattr(f"pinecone.{fname}", function, raising=False)
            for cname, class_ in getmembers(pinecone_mock, isclass):
                monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False)

        return PineconeDocumentStore(
            api_key=os.environ.get("PINECONE_API_KEY") or "fake-pinecone-test-key",
            embedding_dim=768,
            embedding_field="embedding",
            index="haystack_tests",
            similarity="cosine",
            recreate_index=True,
            metadata_config={"indexed": META_FIELDS},
        )

    @pytest.fixture
    def doc_store_with_docs(self, ds: PineconeDocumentStore, documents: List[Document]) -> PineconeDocumentStore:
        """
        This fixture provides a document store pre-populated with the `documents` fixture.
        """
        ds.write_documents(documents)
        return ds

    @pytest.fixture
    def mocked_ds(self, monkeypatch):
        """
        This fixture provides a `PineconeDocumentStore` subclass instance whose index creation is a `MagicMock`.

        `pinecone.init` and `pinecone.describe_index` are replaced through `monkeypatch`, so the mocks are
        removed again at teardown instead of leaking into tests that run later in the same process.
        """

        class DSMock(PineconeDocumentStore):
            pass

        # raising=False mirrors a plain attribute assignment: the attribute is created if it does not exist
        monkeypatch.setattr(pinecone, "init", MagicMock(), raising=False)
        monkeypatch.setattr(pinecone, "describe_index", MagicMock(), raising=False)
        # Patched on the throwaway subclass only, so the real class is left untouched
        DSMock._create_index = MagicMock()

        return DSMock(api_key="MOCK")

    @pytest.fixture
    def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]:
        """
        Returns nine sample documents expressed in the supported input formats: a flat dict, a dict with a
        nested `meta` key, and `Document` objects with and without metadata.
        """
        return [
            # metafield at the top level for backward compatibility
            {
                "content": "My name is Paul and I live in New York",
                "meta_field": "test-1",
                "name": "file_1.txt",
                "date": "2019-10-01",
                "numeric_field": 5.0,
                "odd_document": True,
                "year": "2021",
                "month": "02",
            },
            # "dict" format
            {
                "content": "My name is Carla and I live in Berlin",
                "meta": {
                    "meta_field": "test-2",
                    "name": "file_2.txt",
                    "date": "2020-03-01",
                    "numeric_field": 5.5,
                    "odd_document": False,
                    "year": "2021",
                    "month": "02",
                },
            },
            # Document object
            Document(
                content="My name is Christelle and I live in Paris",
                meta={
                    "meta_field": "test-3",
                    "name": "file_3.txt",
                    "date": "2018-10-01",
                    "numeric_field": 4.5,
                    "odd_document": True,
                    "year": "2020",
                    "month": "02",
                },
            ),
            Document(
                content="My name is Camila and I live in Madrid",
                meta={
                    "meta_field": "test-4",
                    "name": "file_4.txt",
                    "date": "2021-02-01",
                    "numeric_field": 3.0,
                    "odd_document": False,
                    "year": "2020",
                },
            ),
            Document(
                content="My name is Matteo and I live in Rome",
                meta={
                    "meta_field": "test-5",
                    "name": "file_5.txt",
                    "date": "2019-01-01",
                    "numeric_field": 0.0,
                    "odd_document": True,
                    "year": "2020",
                },
            ),
            Document(
                content="My name is Adele and I live in London",
                meta={
                    "meta_field": "test-5",
                    "name": "file_5.txt",
                    "date": "2019-01-01",
                    "numeric_field": 0.0,
                    "odd_document": True,
                    "year": "2021",
                },
            ),
            # Without meta
            Document(content="My name is Ahmed and I live in Cairo"),
            Document(content="My name is Bruce and I live in Gotham"),
            Document(content="My name is Peter and I live in Quahog"),
        ]

    @pytest.fixture
    def documents(self, docs_all_formats: List[Union[Document, Dict[str, Any]]]) -> List[Document]:
        """
        Returns the `docs_all_formats` entries with every dict converted to a `Document`.
        """
        return [Document.from_dict(doc) if isinstance(doc, dict) else doc for doc in docs_all_formats]

    #
    #  Tests
    #

    @pytest.mark.integration
    def test_doc_store_wrong_init(self):
        """
        This is just a failure check case: passing a string as `pinecone_index` must be rejected.
        """
        with pytest.raises(PineconeDocumentStoreError) as exc_info:
            PineconeDocumentStore(
                api_key=os.environ.get("PINECONE_API_KEY") or "fake-pinecone-test-key",
                embedding_dim=768,
                pinecone_index="p_index",
                embedding_field="embedding",
                index="haystack_tests",
                similarity="cosine",
                metadata_config={"indexed": META_FIELDS},
            )
        assert "`pinecone_index` needs to be a `pinecone.Index` object" in exc_info.value.message

    @pytest.mark.integration
    def test_get_label_count(self, ds, labels):
        with pytest.raises(NotImplementedError):
            ds.get_label_count()

    # NOTE: the PineconeDocumentStore behaves differently to the others when filters are applied.
    # While this should be considered a bug, the relative tests are skipped in the meantime

    @pytest.mark.skip
    @pytest.mark.integration
    def test_compound_filters(self, ds, documents):
        pass

    # NOTE: `test_ne_filters` and `test_nin_filters` used to be defined twice in this class. Only the last
    # definition of a name is kept in the class body, so the skipped versions below were the ones that
    # actually took effect; the shadowed duplicates have been removed.

    @pytest.mark.skip
    @pytest.mark.integration
    def test_ne_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_nin_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_comparison_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_nested_condition_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_nested_condition_not_filters(self, ds, documents):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_delete_documents_by_id_with_filters(self, ds, documents):
        pass

    # NOTE: labels metadata are not supported

    @pytest.mark.skip
    @pytest.mark.integration
    def test_delete_labels_by_filter(self, ds, labels):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_delete_labels_by_filter_id(self, ds, labels):
        pass

    @pytest.mark.skip
    @pytest.mark.integration
    def test_simplified_filters(self, ds, documents):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_labels_with_long_texts(self):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel(self):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel_no_answer(self):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel_filter_aggregations(self):
        pass

    @pytest.mark.skip(reason="labels metadata are not supported")
    @pytest.mark.integration
    def test_multilabel_meta_aggregations(self):
        pass

    # NOTE: Pinecone does not support dates, so it can't do lte or gte on date fields. When a new release introduces
    # this feature, the entire family of test_get_all_documents_extended_filter_* tests will become identical to the
    # one present in the base document store suite, and can be removed from here.

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore):
        eq_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test-1"}})
        normal_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": "test-1"})
        assert eq_docs == normal_docs

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_in(self, doc_store_with_docs: PineconeDocumentStore):
        in_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$in": ["test-1", "test-2", "n.a."]}})
        normal_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": ["test-1", "test-2", "n.a."]})
        assert in_docs == normal_docs

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_ne(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$ne": "test-1"}})
        assert all(d.meta.get("meta_field", None) != "test-1" for d in retrieved_docs)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(
            filters={"meta_field": {"$nin": ["test-1", "test-2", "n.a."]}}
        )
        assert {"test-1", "test-2"}.isdisjoint({d.meta.get("meta_field", None) for d in retrieved_docs})

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
        assert all(d.meta["numeric_field"] > 3.0 for d in retrieved_docs)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
        assert all(d.meta["numeric_field"] >= 3.0 for d in retrieved_docs)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_lt(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
        assert all(d.meta["numeric_field"] < 3.0 for d in retrieved_docs)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_lte(self, doc_store_with_docs: PineconeDocumentStore):
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
        assert all(d.meta["numeric_field"] <= 3.0 for d in retrieved_docs)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_dates(self, doc_store_with_docs: PineconeDocumentStore):
        filters = {"date": {"$lte": "2020-12-31", "$gte": "2019-01-01"}}

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_dates_and_other_field_explicit(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters = {
            "$and": {
                "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
                "name": {"$in": ["file_5.txt", "file_3.txt"]},
            }
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_dates_and_other_field_simplified(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters_simplified = {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "name": ["file_5.txt", "file_3.txt"],
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters_simplified)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_dates_and_or_explicit(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters = {
            "$and": {
                "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
                "$or": {"name": {"$in": ["file_5.txt", "file_3.txt"]}, "numeric_field": {"$lte": 5.0}},
            }
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_dates_and_or_simplified(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters_simplified = {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {"name": ["file_5.txt", "file_3.txt"], "numeric_field": {"$lte": 5.0}},
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters_simplified)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_explicit(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters = {
            "$and": {
                "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
                "$or": {
                    "name": {"$in": ["file_5.txt", "file_3.txt"]},
                    "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test-2"}}},
                },
            }
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_dates_and_or_and_not_simplified(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        filters_simplified = {
            "date": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
            "$or": {
                "name": ["file_5.txt", "file_3.txt"],
                "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test-2"}},
            },
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters_simplified)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_nested_not(self, doc_store_with_docs: PineconeDocumentStore):
        # Test nested logical operations within "$not", important as we apply De Morgan's laws in Weaviatedocstore
        filters = {
            "$not": {
                "$or": {
                    "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test-3"}},
                    "$not": {"date": {"$lt": "2020-01-01"}},
                }
            }
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]t' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.integration
    def test_get_all_documents_extended_filter_compound_same_level_not(
        self, doc_store_with_docs: PineconeDocumentStore
    ):
        # Test same logical operator twice on same level, important as we apply De Morgan's laws in Weaviatedocstore
        filters = {
            "$or": [
                {"$and": {"meta_field": {"$in": ["test-1", "test-2"]}, "date": {"$gte": "2020-01-01"}}},
                {"$and": {"meta_field": {"$in": ["test-3", "test-4"]}, "date": {"$lt": "2020-01-01"}}},
            ]
        }

        with pytest.raises(FilterError, match=r"Comparison value for '\$[l|g]te' operation must be a float or int."):
            doc_store_with_docs.get_all_documents(filters=filters)

    @pytest.mark.integration
    def test_multilayer_dict(self, doc_store_with_docs: PineconeDocumentStore):
        # Test that multilayer dict can be upserted
        multilayer_meta = {
            "parent1": {"parent2": {"parent3": {"child1": 1, "child2": 2}}},
            "meta_field": "multilayer-test",
        }
        doc = Document(
            content="Multilayered dict", meta=multilayer_meta, embedding=np.random.rand(768).astype(np.float32)
        )

        doc_store_with_docs.write_documents([doc])
        retrieved_docs = doc_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "multilayer-test"}})

        assert len(retrieved_docs) == 1
        assert retrieved_docs[0].meta == multilayer_meta

    @pytest.mark.unit
    def test_skip_validating_empty_embeddings(self, ds: PineconeDocumentStore):
        document = Document(id="0", content="test")
        retriever = MockBaseRetriever(document_store=ds, mock_document=document)
        ds.write_documents(documents=[document])
        ds._validate_embeddings_shape = MagicMock()

        ds.update_embeddings(retriever)
        ds._validate_embeddings_shape.assert_called_once()
        # The second update must not trigger another validation call, so the call count is still exactly one
        ds.update_embeddings(retriever, update_existing_embeddings=False)
        ds._validate_embeddings_shape.assert_called_once()

    @pytest.mark.integration
    def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore):
        """
        We expect 1 doc with an embedding because all documents already written in doc_store_with_docs contain no
        embeddings.
        """
        doc = Document(
            content="Doc with embedding",
            embedding=np.random.rand(768).astype(np.float32),
            meta={"meta_field": "test-1"},
        )
        doc_store_with_docs.write_documents([doc])
        assert doc_store_with_docs.get_embedding_count() == 1

    @pytest.mark.integration
    def test_get_embedding_count_with_filters(self, doc_store_with_docs: PineconeDocumentStore):
        """
        We expect 1 doc with an embedding and given filters, because there are only two documents with embedding
        written in doc_store_with_docs, while only one of them satisfies given filters.
        """
        doc_1 = Document(
            content="Doc with embedding 1",
            embedding=np.random.rand(768).astype(np.float32),
            meta={"meta_field": "test-1"},
        )
        doc_2 = Document(
            content="Doc with embedding 2",
            embedding=np.random.rand(768).astype(np.float32),
            meta={"meta_field": "test-2"},
        )
        doc_store_with_docs.write_documents([doc_1, doc_2])
        assert doc_store_with_docs.get_embedding_count(filters={"meta_field": "test-1"}) == 1

    @pytest.mark.integration
    def test_get_embedding_count_with_doc_type_filters(self, doc_store_with_docs: PineconeDocumentStore):
        """
        We expect 2 docs with an embedding and given filters, because there are only two documents with embedding
        written in doc_store_with_docs and both of them satisfy given filters (`meta_field` filter).
        Even though the filters include `doc_type` with value related to documents without embedding (`no-vector`),
        we expect this particular filter to be ignored (irrelevant, since documents with embedding have `doc_type` set
        to `vector`).
        """
        doc_1 = Document(
            content="Doc with embedding 1",
            embedding=np.random.rand(768).astype(np.float32),
            meta={"meta_field": "test-2"},
        )
        doc_2 = Document(
            content="Doc with embedding 2",
            embedding=np.random.rand(768).astype(np.float32),
            meta={"meta_field": "test-2"},
        )
        doc_store_with_docs.write_documents([doc_1, doc_2])
        assert (
            doc_store_with_docs.get_embedding_count(
                filters={TYPE_METADATA_FIELD: DOCUMENT_WITHOUT_EMBEDDING, "meta_field": "test-2"}
            )
            == 2
        )

    @pytest.mark.integration
    def test_get_document_count_after_write_doc_with_embedding(self, doc_store_with_docs: PineconeDocumentStore):
        """
        Tests that get_document_count() returns the correct number of documents in the document store after a document
        with an embedding is written to the document store.
        """
        # there are 9 docs in doc_store_with_docs (all without embeddings)
        initial_document_count = 9

        # we expect initial_document_count documents without embeddings in doc_store_with_docs
        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
        # and also initial_document_count documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count

        # document with embedding is written to doc_store_with_docs
        doc = Document(content="Doc with embedding", embedding=np.random.rand(768).astype(np.float32))
        doc_store_with_docs.write_documents([doc])

        # so we expect initial_document_count + 1 documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count + 1

        # but we expect initial_document_count documents without embeddings to be unchanged
        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count

    @pytest.mark.integration
    def test_get_document_count_after_write_doc_without_embedding(self, doc_store_with_docs: PineconeDocumentStore):
        """
        Tests that get_document_count() returns the correct number of documents in the document store after a document
        without an embedding is written to the document store.
        """
        # there are 9 docs in doc_store_with_docs (all without embeddings)
        initial_document_count = 9

        # we expect initial_document_count documents without embeddings in doc_store_with_docs
        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
        # and we also expect initial_document_count documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count

        # document without embedding is written to doc_store_with_docs
        doc = Document(content="Doc without embedding")
        doc_store_with_docs.write_documents([doc])

        # we now expect initial_document_count + 1 documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count + 1

        # And we also expect initial_document_count + 1 documents without embeddings, because the document we just
        # wrote has no embeddings
        assert (
            doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count + 1
        )

    @pytest.mark.integration
    def test_get_document_count_after_delete_doc_with_embedding(self, doc_store_with_docs: PineconeDocumentStore):
        """
        Tests that get_document_count() returns the correct number of documents in the document store after a document
        with an embedding is deleted from the document store.
        """
        # there are 9 docs in doc_store_with_docs (all without embeddings)
        initial_document_count = 9

        # we expect initial_document_count documents without embeddings in doc_store_with_docs
        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
        # and also initial_document_count documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count

        # two documents with embedding are written to doc_store_with_docs
        doc_1 = Document(content="Doc with embedding 1", embedding=np.random.rand(768).astype(np.float32))
        doc_2 = Document(content="Doc with embedding 2", embedding=np.random.rand(768).astype(np.float32))
        doc_store_with_docs.write_documents([doc_1, doc_2])

        # total number is initial_document_count + 2
        assert doc_store_with_docs.get_document_count() == initial_document_count + 2

        # remove one of the documents with embedding
        all_embedding_docs = doc_store_with_docs.get_all_documents(type_metadata=DOCUMENT_WITH_EMBEDDING)
        doc_store_with_docs.delete_documents(ids=[all_embedding_docs[0].id])

        # since we deleted one doc, we expect initial_document_count + 1 documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count + 1

        # and we expect initial_document_count documents without embeddings
        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count

    @pytest.mark.integration
    def test_get_document_count_after_delete_doc_without_embedding(self, doc_store_with_docs: PineconeDocumentStore):
        """
        Tests that get_document_count() returns the correct number of documents in the document store after a document
        without embedding is deleted from the document store.
        """
        # there are 9 docs in doc_store_with_docs (all without embeddings)
        initial_document_count = 9

        # therefore we expect initial_document_count documents without embeddings in doc_store_with_docs
        assert doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count
        # and also initial_document_count documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count

        # two documents without embedding are written to doc_store_with_docs
        # (the content strings say "with embedding", but `embedding=None` is what matters here)
        doc_1 = Document(content="Doc with embedding 1", embedding=None)
        doc_2 = Document(content="Doc with embedding 2", embedding=None)
        doc_store_with_docs.write_documents([doc_1, doc_2])

        # total number is initial_document_count + 2
        assert doc_store_with_docs.get_document_count() == initial_document_count + 2

        # remove one of the documents without embedding
        all_non_embedding_docs = doc_store_with_docs.get_all_documents(type_metadata=DOCUMENT_WITHOUT_EMBEDDING)
        doc_store_with_docs.delete_documents(ids=[all_non_embedding_docs[0].id])

        # since we deleted one doc, we expect initial_document_count + 1 documents in total
        assert doc_store_with_docs.get_document_count() == initial_document_count + 1

        # and we expect initial_document_count + 1 documents without embeddings as well
        assert (
            doc_store_with_docs.get_document_count(only_documents_without_embedding=True) == initial_document_count + 1
        )

    @pytest.mark.unit
    def test_get_all_labels_legacy_document_id(self, ds, monkeypatch):
        """
        Tests that a stored label carrying the legacy single `label-answer-document-id` field is returned by
        get_all_labels() with that id wrapped in the answer's `document_ids` list.
        """
        monkeypatch.setattr(
            ds,
            "get_all_documents",
            MagicMock(
                return_value=[
                    Document.from_dict(
                        {
                            "content": "My name is Carla and I live in Berlin",
                            "content_type": "text",
                            "score": None,
                            "meta": {
                                "label-id": "d9256445-7b8a-4a33-a558-402ec84d6881",
                                "query": "query_1",
                                "label-is-correct-answer": False,
                                "label-is-correct-document": True,
                                "label-document-content": "My name is Carla and I live in Berlin",
                                "label-document-id": "a0747b83aea0b60c4b114b15476dd32d",
                                "label-no-answer": False,
                                "label-origin": "user-feedback",
                                "label-created-at": "2023-02-07 14:46:54",
                                "label-updated-at": None,
                                "label-pipeline-id": None,
                                "label-document-meta-meta_field": "test-2",
                                "label-document-meta-name": "file_2.txt",
                                "label-document-meta-date": "2020-03-01",
                                "label-document-meta-numeric_field": 5.5,
                                "label-document-meta-odd_document": False,
                                "label-document-meta-year": "2021",
                                "label-document-meta-month": "02",
                                "label-meta-name": "label_1",
                                "label-meta-year": "2021",
                                "label-answer-answer": "the answer is 1",
                                "label-answer-type": "extractive",
                                "label-answer-score": None,
                                "label-answer-context": None,
                                # legacy document_id answer
                                "label-answer-document-id": "a0747b83aea0b60c4b114b15476dd32d",
                                "label-answer-offsets-in-document-start": None,
                                "label-answer-offsets-in-document-end": None,
                                "label-answer-offsets-in-context-start": None,
                                "label-answer-offsets-in-context-end": None,
                            },
                            "id_hash_keys": ["content"],
                            "embedding": None,
                            "id": "d9256445-7b8a-4a33-a558-402ec84d6881",
                        }
                    )
                ]
            ),
        )
        labels = ds.get_all_labels()
        assert labels[0].answer.document_ids == ["a0747b83aea0b60c4b114b15476dd32d"]

    @pytest.mark.unit
    def test_split_overlap_meta(self, mocked_ds):
        """
        Tests that we can upload Docs with a _split_overlap_meta field to Pinecone as a JSON string and that the
        field is parsed correctly as dictionary when retrieved.
        """
        doc = Document(content="test", meta={"_split_overlap": [{"doc_id": "test_id", "range": (0, 10)}]}, id="test_id")

        # Test writing as JSON string
        mocked_ds.write_documents([doc])
        call_args = mocked_ds.pinecone_indexes["document"].upsert.call_args.kwargs
        assert list(call_args["vectors"])[0][2] == {
            "doc_type": "no-vector",
            "content": "test",
            "content_type": "text",
            "_split_overlap": '[{"doc_id": "test_id", "range": [0, 10]}]',
        }

        # Test retrieving as dict
        mocked_ds._get_all_document_ids = MagicMock(return_value=["test_id"])
        mocked_ds.pinecone_indexes["document"].fetch.return_value = {
            "vectors": {
                "test_id": {
                    "metadata": {
                        "_split_overlap": '[{"doc_id": "test_id", "range": [0, 10]}]',
                        "content": "test",
                        "content_type": "text",
                    }
                }
            }
        }
        retrieved_docs = mocked_ds.get_all_documents()
        assert retrieved_docs[0].meta["_split_overlap"] == [{"doc_id": "test_id", "range": [0, 10]}]