feat: Pinecone document store refactoring (#5725)

* Refactor codebase so that doc_type metadata is used instead of namespaces to distinguish between documents without embeddings, documents with embeddings, and labels

* Fix parameter name in integration test

* Remove commented-out code in add_type_metadata_filter method

* Fix mypy and pylint checks

* Add release note

* Apply minimal changes: rename method, update method docs and remove redundant method

* Mypy fixes

* Fix docstrings

* Revert helper methods for fetching documents when the number of documents exceeds Pinecone limit

* Remove unnecessary attributes in PineconeDocumentStore

* Fix unit test

---------

Co-authored-by: Ivana Zeljkovic <ivana.zeljkovic@smartcat.io>
Co-authored-by: DosticJelena <jelena.dostic@smartcat.io>
This commit is contained in:
Ivana Zeljkovic 2023-09-14 11:46:47 +02:00 committed by GitHub
parent beb8853412
commit 4bad202197
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 441 additions and 336 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,6 @@
---
enhancements:
- |
Refactor PineconeDocumentStore to use metadata instead of namespaces
to distinguish between documents with embeddings, documents without
embeddings, and labels.

View File

@ -1,20 +1,18 @@
from typing import List, Union, Dict, Any
import os
import numpy as np
from inspect import getmembers, isclass, isfunction
from typing import Any, Dict, List, Union
from unittest.mock import MagicMock
import numpy as np
import pytest
from haystack.document_stores.pinecone import pinecone
from haystack.document_stores.pinecone import PineconeDocumentStore
from haystack.schema import Document
from haystack.document_stores.pinecone import DOCUMENT_WITH_EMBEDDING, PineconeDocumentStore, pinecone
from haystack.errors import FilterError, PineconeDocumentStoreError
from haystack.schema import Document
from haystack.testing import DocumentStoreBaseTestAbstract
from ..mocks import pinecone as pinecone_mock
from ..conftest import MockBaseRetriever
from ..mocks import pinecone as pinecone_mock
# Set metadata fields used during testing for PineconeDocumentStore meta_config
META_FIELDS = ["meta_field", "name", "date", "numeric_field", "odd_document"]
@ -150,6 +148,7 @@ class TestPineconeDocumentStore(DocumentStoreBaseTestAbstract):
#
# Tests
#
@pytest.mark.integration
def test_doc_store_wrong_init(self):
"""
@ -545,7 +544,8 @@ class TestPineconeDocumentStore(DocumentStoreBaseTestAbstract):
assert doc_store_with_docs.get_document_count() == initial_document_count + 2
# remove one of the documents with embedding
all_embedding_docs = doc_store_with_docs.get_all_documents(namespace="vectors")
all_embedding_docs = doc_store_with_docs.get_all_documents(type_metadata=DOCUMENT_WITH_EMBEDDING)
doc_store_with_docs.delete_documents(ids=[all_embedding_docs[0].id])
# since we deleted one doc, we expect initial_document_count + 1 documents in total
@ -577,7 +577,7 @@ class TestPineconeDocumentStore(DocumentStoreBaseTestAbstract):
assert doc_store_with_docs.get_document_count() == initial_document_count + 2
# remove one of the documents without embedding
all_non_embedding_docs = doc_store_with_docs.get_all_documents(namespace="no-vectors")
all_non_embedding_docs = doc_store_with_docs.get_all_documents(type_metadata="no-vector")
doc_store_with_docs.delete_documents(ids=[all_non_embedding_docs[0].id])
# since we deleted one doc, we expect initial_document_count + 1 documents in total
@ -656,6 +656,7 @@ class TestPineconeDocumentStore(DocumentStoreBaseTestAbstract):
mocked_ds.write_documents([doc])
call_args = mocked_ds.pinecone_indexes["document"].upsert.call_args.kwargs
assert list(call_args["vectors"])[0][2] == {
"doc_type": "no-vector",
"content": "test",
"content_type": "text",
"_split_overlap": '[{"doc_id": "test_id", "range": [0, 10]}]',