Ignore cross-reference properties when loading documents (#4664)

* drop cross-reference properties

* be more defensive

* fix regression
This commit is contained in:
Massimiliano Pippi 2023-04-17 10:40:30 +02:00 committed by GitHub
parent dbe3049682
commit a03e8335aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 57 additions and 10 deletions

View File

@ -28,6 +28,20 @@ from haystack.nodes.retriever import DenseRetriever
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Matches a UUID written in its canonical 8-4-4-4-12 hexadecimal form; letter case is ignored.
UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)

# Scalar property data types; each one is also accepted in its array ("[]") form.
_SCALAR_PROPERTY_TYPES = ("string", "int", "boolean", "number", "date", "text")

# Every property data type treated as supported: the scalars above plus their
# array counterparts. Any other data type is considered unsupported.
SUPPORTED_PROPERTY_TYPES = set(_SCALAR_PROPERTY_TYPES) | {
    scalar_type + "[]" for scalar_type in _SCALAR_PROPERTY_TYPES
}
class WeaviateDocumentStoreError(DocumentStoreError):
@ -391,13 +405,21 @@ class WeaviateDocumentStore(KeywordDocumentStore):
def _get_current_properties(self, index: Optional[str] = None) -> List[str]:
    """
    Get all the existing properties in the schema, excluding those with complex types
    like cross-reference.

    :param index: Name of the index whose properties should be listed. The name is passed
                  through `_sanitize_index_name`; if the result is falsy, `self.index` is used.
    :return: Names of the properties whose `dataType` entries are all contained in
             `SUPPORTED_PROPERTY_TYPES`. An empty list is returned when no class in the
             schema matches the index.
    """
    index = self._sanitize_index_name(index) or self.index
    cur_properties = []
    for class_item in self.weaviate_client.schema.get()["classes"]:
        if class_item["class"] == index:
            cur_properties = [
                item["name"]
                for item in class_item.get("properties", [])
                # dataType should be always there and contain only one item unless
                # it's a cross-reference but here we try to be defensive against
                # unexpected schemas. Note that a property without a dataType yields
                # an empty set, which is a subset of anything, so it is kept.
                if set(item.get("dataType", [])).issubset(SUPPORTED_PROPERTY_TYPES)
            ]

    return cur_properties

View File

@ -1,16 +1,14 @@
import uuid
import json
from unittest import mock
import pytest
import numpy as np
from haystack.document_stores.weaviate import WeaviateDocumentStore
from haystack.schema import Document
from haystack.testing import DocumentStoreBaseTestAbstract
import uuid
from unittest.mock import MagicMock
import numpy as np
import pytest
from haystack.schema import Document
embedding_dim = 768
@ -207,7 +205,7 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
ds.write_documents(documents)
# This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
# a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
ds.get_all_documents = MagicMock(wraps=ds.get_all_documents)
ds.get_all_documents = mock.MagicMock(wraps=ds.get_all_documents)
assert ds.get_document_count() == 9
@ -258,3 +256,30 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
"""
ds.write_documents(documents)
assert ds.get_embedding_count() == 9
@pytest.mark.unit
def test__get_current_properties(self):
    """
    Verify that `_get_current_properties` leaves out cross-reference properties.

    The Weaviate client is replaced by a mock whose schema holds one "Document" class
    with two properties: "hasWritten" (dataType ["Article"]) and "hitCounter"
    (dataType ["int"]). Only "hitCounter" is expected in the result.
    """
    schema_payload = json.loads(
        """
        {
            "classes": [{
                "class": "Document",
                "properties": [
                    {
                        "name": "hasWritten",
                        "dataType": ["Article"]
                    },
                    {
                        "name": "hitCounter",
                        "dataType": ["int"]
                    }
                ]
            }]
        } """
    )

    with mock.patch("haystack.document_stores.weaviate.client") as mocked_client:
        # Every call to the patched `Client` returns this same mock instance.
        weaviate_client = mocked_client.Client.return_value
        weaviate_client.is_ready.return_value = True
        weaviate_client.schema.contains.return_value = False
        weaviate_client.schema.get.return_value = schema_payload

        ds = WeaviateDocumentStore()

        # Ensure we dropped the cross-reference property
        assert ds._get_current_properties() == ["hitCounter"]