mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-31 03:46:30 +00:00
Ignore cross-reference properties when loading documents (#4664)
* drop cross-reference properties * be more defensive * fix regression
This commit is contained in:
parent
dbe3049682
commit
a03e8335aa
@ -28,6 +28,20 @@ from haystack.nodes.retriever import DenseRetriever
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)
|
||||
SUPPORTED_PROPERTY_TYPES = {
|
||||
"string",
|
||||
"string[]",
|
||||
"int",
|
||||
"int[]",
|
||||
"boolean",
|
||||
"boolean[]",
|
||||
"number",
|
||||
"number[]",
|
||||
"date",
|
||||
"date[]",
|
||||
"text",
|
||||
"text[]",
|
||||
}
|
||||
|
||||
|
||||
class WeaviateDocumentStoreError(DocumentStoreError):
|
||||
@ -391,13 +405,21 @@ class WeaviateDocumentStore(KeywordDocumentStore):
|
||||
|
||||
def _get_current_properties(self, index: Optional[str] = None) -> List[str]:
|
||||
"""
|
||||
Get all the existing properties in the schema.
|
||||
Get all the existing properties in the schema, excluding those with complex types
|
||||
like cross-reference
|
||||
"""
|
||||
index = self._sanitize_index_name(index) or self.index
|
||||
cur_properties = []
|
||||
for class_item in self.weaviate_client.schema.get()["classes"]:
|
||||
if class_item["class"] == index:
|
||||
cur_properties = [item["name"] for item in class_item["properties"]]
|
||||
cur_properties = [
|
||||
item["name"]
|
||||
for item in class_item.get("properties", [])
|
||||
# dataType should be always there and contain only one item unless
|
||||
# it's a cross-reference but here we try to be defensive against
|
||||
# unexpected schemas
|
||||
if set(item.get("dataType", [])).issubset(SUPPORTED_PROPERTY_TYPES)
|
||||
]
|
||||
|
||||
return cur_properties
|
||||
|
||||
|
@ -1,16 +1,14 @@
|
||||
import uuid
|
||||
import json
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
from haystack.document_stores.weaviate import WeaviateDocumentStore
|
||||
from haystack.schema import Document
|
||||
from haystack.testing import DocumentStoreBaseTestAbstract
|
||||
|
||||
import uuid
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from haystack.schema import Document
|
||||
|
||||
embedding_dim = 768
|
||||
|
||||
@ -207,7 +205,7 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
|
||||
ds.write_documents(documents)
|
||||
# This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
|
||||
# a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
|
||||
ds.get_all_documents = MagicMock(wraps=ds.get_all_documents)
|
||||
ds.get_all_documents = mock.MagicMock(wraps=ds.get_all_documents)
|
||||
|
||||
assert ds.get_document_count() == 9
|
||||
|
||||
@ -258,3 +256,30 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
|
||||
"""
|
||||
ds.write_documents(documents)
|
||||
assert ds.get_embedding_count() == 9
|
||||
|
||||
@pytest.mark.unit
|
||||
def test__get_current_properties(self):
|
||||
with mock.patch("haystack.document_stores.weaviate.client") as mocked_client:
|
||||
mocked_client.Client().is_ready.return_value = True
|
||||
mocked_client.Client().schema.contains.return_value = False
|
||||
mocked_client.Client().schema.get.return_value = json.loads(
|
||||
"""
|
||||
{
|
||||
"classes": [{
|
||||
"class": "Document",
|
||||
"properties": [
|
||||
{
|
||||
"name": "hasWritten",
|
||||
"dataType": ["Article"]
|
||||
},
|
||||
{
|
||||
"name": "hitCounter",
|
||||
"dataType": ["int"]
|
||||
}
|
||||
]
|
||||
}]
|
||||
} """
|
||||
)
|
||||
ds = WeaviateDocumentStore()
|
||||
# Ensure we dropped the cross-reference property
|
||||
assert ds._get_current_properties() == ["hitCounter"]
|
||||
|
Loading…
x
Reference in New Issue
Block a user