Bug fix Weaviate document deletion (#2899)

* Bug fix Weaviate document deletion

If no filters param is passed in, then the original code retrieves *all* documents before then deleting by their IDs. There's no need for that, since we can delete by their IDs directly.

* Edit comment to clarify deletion and recreation

* Write unit tests for bug fix
This commit is contained in:
Steven Haley 2022-07-29 16:21:25 +01:00 committed by GitHub
parent 434b1c3682
commit 6b7d4a0514
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 3 deletions

View File

@ -1299,10 +1299,19 @@ class WeaviateDocumentStore(BaseDocumentStore):
index = self._sanitize_index_name(index) or self.index
if not filters and not ids:
# Delete the existing index, then create an empty new one
self._create_schema_and_index(index, recreate_index=True)
return
# Create index if it doesn't exist yet
self._create_schema_and_index(index, recreate_index=False)
if ids and not filters:
for id in ids:
self.weaviate_client.data_object.delete(id)
else:
# create index if it doesn't exist yet
self._create_schema_and_index(index, recreate_index=False)
# Use filters to restrict list of retrieved documents, before checking these against provided ids
docs_to_delete = self.get_all_documents(index, filters=filters)
if ids:
docs_to_delete = [doc for doc in docs_to_delete if doc.id in ids]

View File

@ -1,4 +1,5 @@
import uuid
from unittest.mock import MagicMock
import numpy as np
import pytest
@ -6,7 +7,6 @@ import pytest
from haystack.schema import Document
from ..conftest import get_document_store
embedding_dim = 768
@ -123,3 +123,24 @@ def test_get_all_documents_unaffected_by_QUERY_MAXIMUM_RESULTS(document_store_wi
monkeypatch.setattr(document_store_with_docs, "get_document_count", lambda **kwargs: 13_000)
docs = document_store_with_docs.get_all_documents()
assert len(docs) == 3
@pytest.mark.weaviate
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
def test_deleting_by_id_or_by_filters(document_store_with_docs):
# This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
# a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
document_store_with_docs.get_all_documents = MagicMock(wraps=document_store_with_docs.get_all_documents)
assert document_store_with_docs.get_document_count() == 3
# Delete a document by its ID. This should bypass the get_all_documents() call
document_store_with_docs.delete_documents(ids=[DOCUMENTS_XS[0]["id"]])
document_store_with_docs.get_all_documents.assert_not_called()
assert document_store_with_docs.get_document_count() == 2
document_store_with_docs.get_all_documents.reset_mock()
# Delete a document with filters. Prove that using the filters will go through get_all_documents()
document_store_with_docs.delete_documents(filters={"name": ["filename2"]})
document_store_with_docs.get_all_documents.assert_called()
assert document_store_with_docs.get_document_count() == 1