Ignore cross-reference properties when loading documents (#4664)

* drop cross-reference properties

* be more defensive

* fix regression
This commit is contained in:
Massimiliano Pippi 2023-04-17 10:40:30 +02:00 committed by GitHub
parent dbe3049682
commit a03e8335aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 57 additions and 10 deletions

View File

@ -28,6 +28,20 @@ from haystack.nodes.retriever import DenseRetriever
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE) UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)
# Property data types that are accepted when listing the properties of a class.
# Every scalar type is paired with its array form, spelled with a trailing "[]".
SUPPORTED_PROPERTY_TYPES = {
    scalar + suffix
    for scalar in ("string", "int", "boolean", "number", "date", "text")
    for suffix in ("", "[]")
}
class WeaviateDocumentStoreError(DocumentStoreError): class WeaviateDocumentStoreError(DocumentStoreError):
@ -391,13 +405,21 @@ class WeaviateDocumentStore(KeywordDocumentStore):
def _get_current_properties(self, index: Optional[str] = None) -> List[str]:
    """
    Get all the existing properties in the schema, excluding those with complex types
    like cross-reference.

    :param index: Name of the class to inspect. It is passed through
                  `self._sanitize_index_name`; `self.index` is used when the result is falsy.
    :return: Names of the properties of the matching class whose `dataType` entries are all
             listed in `SUPPORTED_PROPERTY_TYPES`. An empty list is returned when no class
             in the schema matches.
    """
    index = self._sanitize_index_name(index) or self.index
    for class_item in self.weaviate_client.schema.get()["classes"]:
        if class_item["class"] != index:
            continue
        # Stop at the matching class: there is nothing left to collect from the others.
        return [
            item["name"]
            # `or []` also covers an explicit null, which `.get(key, [])` alone lets through
            for item in class_item.get("properties") or []
            # dataType should be always there and contain only one item unless
            # it's a cross-reference but here we try to be defensive against
            # unexpected schemas. A missing or empty dataType yields an empty set,
            # which is a subset of anything, so such a property is kept.
            if set(item.get("dataType") or []).issubset(SUPPORTED_PROPERTY_TYPES)
        ]
    return []

View File

@ -1,16 +1,14 @@
import uuid
import json
from unittest import mock
import pytest import pytest
import numpy as np
from haystack.document_stores.weaviate import WeaviateDocumentStore from haystack.document_stores.weaviate import WeaviateDocumentStore
from haystack.schema import Document from haystack.schema import Document
from haystack.testing import DocumentStoreBaseTestAbstract from haystack.testing import DocumentStoreBaseTestAbstract
import uuid
from unittest.mock import MagicMock
import numpy as np
import pytest
from haystack.schema import Document
embedding_dim = 768 embedding_dim = 768
@ -207,7 +205,7 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
ds.write_documents(documents) ds.write_documents(documents)
# This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes # This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
# a bug, as described in https://github.com/deepset-ai/haystack/issues/2898 # a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
ds.get_all_documents = MagicMock(wraps=ds.get_all_documents) ds.get_all_documents = mock.MagicMock(wraps=ds.get_all_documents)
assert ds.get_document_count() == 9 assert ds.get_document_count() == 9
@ -258,3 +256,30 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
""" """
ds.write_documents(documents) ds.write_documents(documents)
assert ds.get_embedding_count() == 9 assert ds.get_embedding_count() == 9
@pytest.mark.unit
def test__get_current_properties(self):
    """
    A schema holding one cross-reference property and one `int` property must
    report only the `int` property.
    """
    # Schema returned by the mocked client: "hasWritten" points at another class
    # ("Article"), "hitCounter" is a plain integer.
    schema = json.loads(
        """
        {
        "classes": [{
        "class": "Document",
        "properties": [
        {
        "name": "hasWritten",
        "dataType": ["Article"]
        },
        {
        "name": "hitCounter",
        "dataType": ["int"]
        }
        ]
        }]
        } """
    )
    with mock.patch("haystack.document_stores.weaviate.client") as mocked_client:
        fake_client = mocked_client.Client()
        fake_client.is_ready.return_value = True
        fake_client.schema.contains.return_value = False
        fake_client.schema.get.return_value = schema

        ds = WeaviateDocumentStore()
        # Ensure we dropped the cross-reference property
        assert ds._get_current_properties() == ["hitCounter"]