Ignore cross-reference properties when loading documents (#4664)

* drop cross-reference properties * be more defensive * fix regression
2026-01-07 12:37:27 +00:00 · 2023-04-17 10:40:30 +02:00 · 2023-04-17 10:40:30 +02:00 · a03e8335aa
commit a03e8335aa
parent dbe3049682
2 changed files with 57 additions and 10 deletions
--- a/haystack/document_stores/weaviate.py
+++ b/haystack/document_stores/weaviate.py
@ -28,6 +28,20 @@ from haystack.nodes.retriever import DenseRetriever

 logger = logging.getLogger(__name__)
 UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)
+SUPPORTED_PROPERTY_TYPES = {
+    "string",
+    "string[]",
+    "int",
+    "int[]",
+    "boolean",
+    "boolean[]",
+    "number",
+    "number[]",
+    "date",
+    "date[]",
+    "text",
+    "text[]",
+}


 class WeaviateDocumentStoreError(DocumentStoreError):
@ -391,13 +405,21 @@ class WeaviateDocumentStore(KeywordDocumentStore):

    def _get_current_properties(self, index: Optional[str] = None) -> List[str]:
        """
-        Get all the existing properties in the schema.
+        Get all the existing properties in the schema, excluding those with complex types
+        like cross-reference
        """
        index = self._sanitize_index_name(index) or self.index
        cur_properties = []
        for class_item in self.weaviate_client.schema.get()["classes"]:
            if class_item["class"] == index:
-                cur_properties = [item["name"] for item in class_item["properties"]]
+                cur_properties = [
+                    item["name"]
+                    for item in class_item.get("properties", [])
+                    # dataType should be always there and contain only one item unless
+                    # it's a cross-reference but here we try to be defensive against
+                    # unexpected schemas
+                    if set(item.get("dataType", [])).issubset(SUPPORTED_PROPERTY_TYPES)
+                ]

        return cur_properties

--- a/test/document_stores/test_weaviate.py
+++ b/test/document_stores/test_weaviate.py
@ -1,16 +1,14 @@
+import uuid
+import json
+from unittest import mock
+
 import pytest
+import numpy as np

 from haystack.document_stores.weaviate import WeaviateDocumentStore
 from haystack.schema import Document
 from haystack.testing import DocumentStoreBaseTestAbstract

-import uuid
-from unittest.mock import MagicMock
-
-import numpy as np
-import pytest
-
-from haystack.schema import Document

 embedding_dim = 768

@ -207,7 +205,7 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
        ds.write_documents(documents)
        # This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
        # a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
-        ds.get_all_documents = MagicMock(wraps=ds.get_all_documents)
+        ds.get_all_documents = mock.MagicMock(wraps=ds.get_all_documents)

        assert ds.get_document_count() == 9

@ -258,3 +256,30 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
        """
        ds.write_documents(documents)
        assert ds.get_embedding_count() == 9
+
+    @pytest.mark.unit
+    def test__get_current_properties(self):
+        with mock.patch("haystack.document_stores.weaviate.client") as mocked_client:
+            mocked_client.Client().is_ready.return_value = True
+            mocked_client.Client().schema.contains.return_value = False
+            mocked_client.Client().schema.get.return_value = json.loads(
+                """
+{
+  "classes": [{
+    "class": "Document",
+    "properties": [
+        {
+        "name": "hasWritten",
+        "dataType": ["Article"]
+        },
+        {
+        "name": "hitCounter",
+        "dataType": ["int"]
+        }
+    ]
+  }]
+} """
+            )
+            ds = WeaviateDocumentStore()
+            # Ensure we dropped the cross-reference property
+            assert ds._get_current_properties() == ["hitCounter"]