mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-01 12:23:31 +00:00
Ignore cross-reference properties when loading documents (#4664)
* drop cross-reference properties * be more defensive * fix regression
This commit is contained in:
parent
dbe3049682
commit
a03e8335aa
@ -28,6 +28,20 @@ from haystack.nodes.retriever import DenseRetriever
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)
|
UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)
|
||||||
|
SUPPORTED_PROPERTY_TYPES = {
|
||||||
|
"string",
|
||||||
|
"string[]",
|
||||||
|
"int",
|
||||||
|
"int[]",
|
||||||
|
"boolean",
|
||||||
|
"boolean[]",
|
||||||
|
"number",
|
||||||
|
"number[]",
|
||||||
|
"date",
|
||||||
|
"date[]",
|
||||||
|
"text",
|
||||||
|
"text[]",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class WeaviateDocumentStoreError(DocumentStoreError):
|
class WeaviateDocumentStoreError(DocumentStoreError):
|
||||||
@ -391,13 +405,21 @@ class WeaviateDocumentStore(KeywordDocumentStore):
|
|||||||
|
|
||||||
def _get_current_properties(self, index: Optional[str] = None) -> List[str]:
|
def _get_current_properties(self, index: Optional[str] = None) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Get all the existing properties in the schema.
|
Get all the existing properties in the schema, excluding those with complex types
|
||||||
|
like cross-reference
|
||||||
"""
|
"""
|
||||||
index = self._sanitize_index_name(index) or self.index
|
index = self._sanitize_index_name(index) or self.index
|
||||||
cur_properties = []
|
cur_properties = []
|
||||||
for class_item in self.weaviate_client.schema.get()["classes"]:
|
for class_item in self.weaviate_client.schema.get()["classes"]:
|
||||||
if class_item["class"] == index:
|
if class_item["class"] == index:
|
||||||
cur_properties = [item["name"] for item in class_item["properties"]]
|
cur_properties = [
|
||||||
|
item["name"]
|
||||||
|
for item in class_item.get("properties", [])
|
||||||
|
# dataType should be always there and contain only one item unless
|
||||||
|
# it's a cross-reference but here we try to be defensive against
|
||||||
|
# unexpected schemas
|
||||||
|
if set(item.get("dataType", [])).issubset(SUPPORTED_PROPERTY_TYPES)
|
||||||
|
]
|
||||||
|
|
||||||
return cur_properties
|
return cur_properties
|
||||||
|
|
||||||
|
@ -1,16 +1,14 @@
|
|||||||
|
import uuid
|
||||||
|
import json
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from haystack.document_stores.weaviate import WeaviateDocumentStore
|
from haystack.document_stores.weaviate import WeaviateDocumentStore
|
||||||
from haystack.schema import Document
|
from haystack.schema import Document
|
||||||
from haystack.testing import DocumentStoreBaseTestAbstract
|
from haystack.testing import DocumentStoreBaseTestAbstract
|
||||||
|
|
||||||
import uuid
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from haystack.schema import Document
|
|
||||||
|
|
||||||
embedding_dim = 768
|
embedding_dim = 768
|
||||||
|
|
||||||
@ -207,7 +205,7 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
|
|||||||
ds.write_documents(documents)
|
ds.write_documents(documents)
|
||||||
# This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
|
# This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
|
||||||
# a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
|
# a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
|
||||||
ds.get_all_documents = MagicMock(wraps=ds.get_all_documents)
|
ds.get_all_documents = mock.MagicMock(wraps=ds.get_all_documents)
|
||||||
|
|
||||||
assert ds.get_document_count() == 9
|
assert ds.get_document_count() == 9
|
||||||
|
|
||||||
@ -258,3 +256,30 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
|
|||||||
"""
|
"""
|
||||||
ds.write_documents(documents)
|
ds.write_documents(documents)
|
||||||
assert ds.get_embedding_count() == 9
|
assert ds.get_embedding_count() == 9
|
||||||
|
|
||||||
|
@pytest.mark.unit
|
||||||
|
def test__get_current_properties(self):
|
||||||
|
with mock.patch("haystack.document_stores.weaviate.client") as mocked_client:
|
||||||
|
mocked_client.Client().is_ready.return_value = True
|
||||||
|
mocked_client.Client().schema.contains.return_value = False
|
||||||
|
mocked_client.Client().schema.get.return_value = json.loads(
|
||||||
|
"""
|
||||||
|
{
|
||||||
|
"classes": [{
|
||||||
|
"class": "Document",
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"name": "hasWritten",
|
||||||
|
"dataType": ["Article"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "hitCounter",
|
||||||
|
"dataType": ["int"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}]
|
||||||
|
} """
|
||||||
|
)
|
||||||
|
ds = WeaviateDocumentStore()
|
||||||
|
# Ensure we dropped the cross-reference property
|
||||||
|
assert ds._get_current_properties() == ["hitCounter"]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user