From a03e8335aa5e0ecc3fc77d519dff54e7ed2af5cb Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 17 Apr 2023 10:40:30 +0200 Subject: [PATCH] Ignore cross-reference properties when loading documents (#4664) * drop cross-reference properties * be more defensive * fix regression --- haystack/document_stores/weaviate.py | 26 +++++++++++++++-- test/document_stores/test_weaviate.py | 41 +++++++++++++++++++++------ 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index ace1a0163..90c088f00 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -28,6 +28,20 @@ from haystack.nodes.retriever import DenseRetriever logger = logging.getLogger(__name__) UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE) +SUPPORTED_PROPERTY_TYPES = { + "string", + "string[]", + "int", + "int[]", + "boolean", + "boolean[]", + "number", + "number[]", + "date", + "date[]", + "text", + "text[]", +} class WeaviateDocumentStoreError(DocumentStoreError): @@ -391,13 +405,21 @@ class WeaviateDocumentStore(KeywordDocumentStore): def _get_current_properties(self, index: Optional[str] = None) -> List[str]: """ - Get all the existing properties in the schema. + Get all the existing properties in the schema, excluding those with complex types + like cross-reference """ index = self._sanitize_index_name(index) or self.index cur_properties = [] for class_item in self.weaviate_client.schema.get()["classes"]: if class_item["class"] == index: - cur_properties = [item["name"] for item in class_item["properties"]] + cur_properties = [ + item["name"] + for item in class_item.get("properties", []) + # dataType should be always there and contain only one item unless + # it's a cross-reference but here we try to be defensive against + # unexpected schemas + if set(item.get("dataType", [])).issubset(SUPPORTED_PROPERTY_TYPES) + ] return cur_properties diff --git a/test/document_stores/test_weaviate.py b/test/document_stores/test_weaviate.py index 3627bd254..20c5f9c0a 100644 --- a/test/document_stores/test_weaviate.py +++ b/test/document_stores/test_weaviate.py @@ -1,16 +1,14 @@ +import uuid +import json +from unittest import mock + import pytest +import numpy as np from haystack.document_stores.weaviate import WeaviateDocumentStore from haystack.schema import Document from haystack.testing import DocumentStoreBaseTestAbstract -import uuid -from unittest.mock import MagicMock - -import numpy as np -import pytest - -from haystack.schema import Document embedding_dim = 768 @@ -207,7 +205,7 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract): ds.write_documents(documents) # This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes # a bug, as described in https://github.com/deepset-ai/haystack/issues/2898 - ds.get_all_documents = MagicMock(wraps=ds.get_all_documents) + ds.get_all_documents = mock.MagicMock(wraps=ds.get_all_documents) assert ds.get_document_count() == 9 @@ -258,3 +256,30 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract): """ ds.write_documents(documents) assert ds.get_embedding_count() == 9 + + @pytest.mark.unit + def test__get_current_properties(self): + with mock.patch("haystack.document_stores.weaviate.client") as mocked_client: + mocked_client.Client().is_ready.return_value = True + mocked_client.Client().schema.contains.return_value = False + mocked_client.Client().schema.get.return_value = json.loads( + """ +{ + "classes": [{ + "class": "Document", + "properties": [ + { + "name": "hasWritten", + "dataType": ["Article"] + }, + { + "name": "hitCounter", + "dataType": ["int"] + } + ] + }] +} """ + ) + ds = WeaviateDocumentStore() + # Ensure we dropped the cross-reference property + assert ds._get_current_properties() == ["hitCounter"]