mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 17:59:27 +00:00 
			
		
		
		
	Ignore cross-reference properties when loading documents (#4664)
* drop cross-reference properties * be more defensive * fix regression
This commit is contained in:
		
							parent
							
								
									dbe3049682
								
							
						
					
					
						commit
						a03e8335aa
					
				| @ -28,6 +28,20 @@ from haystack.nodes.retriever import DenseRetriever | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE) | ||||
| SUPPORTED_PROPERTY_TYPES = { | ||||
|     "string", | ||||
|     "string[]", | ||||
|     "int", | ||||
|     "int[]", | ||||
|     "boolean", | ||||
|     "boolean[]", | ||||
|     "number", | ||||
|     "number[]", | ||||
|     "date", | ||||
|     "date[]", | ||||
|     "text", | ||||
|     "text[]", | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| class WeaviateDocumentStoreError(DocumentStoreError): | ||||
| @ -391,13 +405,21 @@ class WeaviateDocumentStore(KeywordDocumentStore): | ||||
| 
 | ||||
|     def _get_current_properties(self, index: Optional[str] = None) -> List[str]: | ||||
|         """ | ||||
|         Get all the existing properties in the schema. | ||||
|         Get all the existing properties in the schema, excluding those with complex types | ||||
|         like cross-reference | ||||
|         """ | ||||
|         index = self._sanitize_index_name(index) or self.index | ||||
|         cur_properties = [] | ||||
|         for class_item in self.weaviate_client.schema.get()["classes"]: | ||||
|             if class_item["class"] == index: | ||||
|                 cur_properties = [item["name"] for item in class_item["properties"]] | ||||
|                 cur_properties = [ | ||||
|                     item["name"] | ||||
|                     for item in class_item.get("properties", []) | ||||
|                     # dataType should be always there and contain only one item unless | ||||
|                     # it's a cross-reference but here we try to be defensive against | ||||
|                     # unexpected schemas | ||||
|                     if set(item.get("dataType", [])).issubset(SUPPORTED_PROPERTY_TYPES) | ||||
|                 ] | ||||
| 
 | ||||
|         return cur_properties | ||||
| 
 | ||||
|  | ||||
| @ -1,16 +1,14 @@ | ||||
| import uuid | ||||
| import json | ||||
| from unittest import mock | ||||
| 
 | ||||
| import pytest | ||||
| import numpy as np | ||||
| 
 | ||||
| from haystack.document_stores.weaviate import WeaviateDocumentStore | ||||
| from haystack.schema import Document | ||||
| from haystack.testing import DocumentStoreBaseTestAbstract | ||||
| 
 | ||||
| import uuid | ||||
| from unittest.mock import MagicMock | ||||
| 
 | ||||
| import numpy as np | ||||
| import pytest | ||||
| 
 | ||||
| from haystack.schema import Document | ||||
| 
 | ||||
| embedding_dim = 768 | ||||
| 
 | ||||
| @ -207,7 +205,7 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract): | ||||
|         ds.write_documents(documents) | ||||
|         # This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes | ||||
|         # a bug, as described in https://github.com/deepset-ai/haystack/issues/2898 | ||||
|         ds.get_all_documents = MagicMock(wraps=ds.get_all_documents) | ||||
|         ds.get_all_documents = mock.MagicMock(wraps=ds.get_all_documents) | ||||
| 
 | ||||
|         assert ds.get_document_count() == 9 | ||||
| 
 | ||||
| @ -258,3 +256,30 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract): | ||||
|         """ | ||||
|         ds.write_documents(documents) | ||||
|         assert ds.get_embedding_count() == 9 | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test__get_current_properties(self): | ||||
|         with mock.patch("haystack.document_stores.weaviate.client") as mocked_client: | ||||
|             mocked_client.Client().is_ready.return_value = True | ||||
|             mocked_client.Client().schema.contains.return_value = False | ||||
|             mocked_client.Client().schema.get.return_value = json.loads( | ||||
|                 """ | ||||
| { | ||||
|   "classes": [{ | ||||
|     "class": "Document", | ||||
|     "properties": [ | ||||
|         { | ||||
|         "name": "hasWritten", | ||||
|         "dataType": ["Article"] | ||||
|         }, | ||||
|         { | ||||
|         "name": "hitCounter", | ||||
|         "dataType": ["int"] | ||||
|         } | ||||
|     ] | ||||
|   }] | ||||
| } """ | ||||
|             ) | ||||
|             ds = WeaviateDocumentStore() | ||||
|             # Ensure we dropped the cross-reference property | ||||
|             assert ds._get_current_properties() == ["hitCounter"] | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Massimiliano Pippi
						Massimiliano Pippi