mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-15 17:43:55 +00:00
Bug fix Weaviate document deletion (#2899)
* Bug fix Weaviate document deletion

  If no filters param is passed in, the original code retrieves *all* documents and only then deletes the matching ones by their IDs. There's no need for that, since we can delete by ID directly.

* Edit comment to clarify deletion and recreation
* Write unit tests for bug fix
This commit is contained in:
parent
434b1c3682
commit
6b7d4a0514
@ -1299,10 +1299,19 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
|||||||
index = self._sanitize_index_name(index) or self.index
|
index = self._sanitize_index_name(index) or self.index
|
||||||
|
|
||||||
if not filters and not ids:
|
if not filters and not ids:
|
||||||
|
# Delete the existing index, then create an empty new one
|
||||||
self._create_schema_and_index(index, recreate_index=True)
|
self._create_schema_and_index(index, recreate_index=True)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Create index if it doesn't exist yet
|
||||||
|
self._create_schema_and_index(index, recreate_index=False)
|
||||||
|
|
||||||
|
if ids and not filters:
|
||||||
|
for id in ids:
|
||||||
|
self.weaviate_client.data_object.delete(id)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# create index if it doesn't exist yet
|
# Use filters to restrict list of retrieved documents, before checking these against provided ids
|
||||||
self._create_schema_and_index(index, recreate_index=False)
|
|
||||||
docs_to_delete = self.get_all_documents(index, filters=filters)
|
docs_to_delete = self.get_all_documents(index, filters=filters)
|
||||||
if ids:
|
if ids:
|
||||||
docs_to_delete = [doc for doc in docs_to_delete if doc.id in ids]
|
docs_to_delete = [doc for doc in docs_to_delete if doc.id in ids]
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
import uuid
|
import uuid
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
@ -6,7 +7,6 @@ import pytest
|
|||||||
from haystack.schema import Document
|
from haystack.schema import Document
|
||||||
from ..conftest import get_document_store
|
from ..conftest import get_document_store
|
||||||
|
|
||||||
|
|
||||||
embedding_dim = 768
|
embedding_dim = 768
|
||||||
|
|
||||||
|
|
||||||
@ -123,3 +123,24 @@ def test_get_all_documents_unaffected_by_QUERY_MAXIMUM_RESULTS(document_store_wi
|
|||||||
monkeypatch.setattr(document_store_with_docs, "get_document_count", lambda **kwargs: 13_000)
|
monkeypatch.setattr(document_store_with_docs, "get_document_count", lambda **kwargs: 13_000)
|
||||||
docs = document_store_with_docs.get_all_documents()
|
docs = document_store_with_docs.get_all_documents()
|
||||||
assert len(docs) == 3
|
assert len(docs) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.weaviate
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
def test_deleting_by_id_or_by_filters(document_store_with_docs):
    """Check which deletion paths go through ``get_all_documents()``.

    Deleting by ID alone must not fetch documents first, while deleting by
    filters is expected to go through ``get_all_documents()``.
    Regression test for https://github.com/deepset-ai/haystack/issues/2898
    """
    store = document_store_with_docs

    # Spy on get_all_documents() while still delegating to the real method,
    # so the document counts asserted below reflect actual deletions.
    fetch_all_spy = MagicMock(wraps=store.get_all_documents)
    store.get_all_documents = fetch_all_spy

    assert store.get_document_count() == 3

    # ID-only deletion: the spy must stay untouched.
    first_doc_id = DOCUMENTS_XS[0]["id"]
    store.delete_documents(ids=[first_doc_id])
    fetch_all_spy.assert_not_called()
    assert store.get_document_count() == 2

    fetch_all_spy.reset_mock()

    # Filter-based deletion: the documents are looked up via get_all_documents().
    store.delete_documents(filters={"name": ["filename2"]})
    fetch_all_spy.assert_called()
    assert store.get_document_count() == 1
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user