mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-01 02:09:39 +00:00
Bug fix Weaviate document deletion (#2899)
* Bug fix Weaviate document deletion If no filters param is passed in, then the original code retrieves *all* documents before then deleting by their IDs. There's no need for that, since we can delete by their IDs directly. * Edit comment to clarify deletion and recreation * Write unit tests for bug fix
This commit is contained in:
parent
434b1c3682
commit
6b7d4a0514
@ -1299,10 +1299,19 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
index = self._sanitize_index_name(index) or self.index
|
||||
|
||||
if not filters and not ids:
|
||||
# Delete the existing index, then create an empty new one
|
||||
self._create_schema_and_index(index, recreate_index=True)
|
||||
return
|
||||
|
||||
# Create index if it doesn't exist yet
|
||||
self._create_schema_and_index(index, recreate_index=False)
|
||||
|
||||
if ids and not filters:
|
||||
for id in ids:
|
||||
self.weaviate_client.data_object.delete(id)
|
||||
|
||||
else:
|
||||
# create index if it doesn't exist yet
|
||||
self._create_schema_and_index(index, recreate_index=False)
|
||||
# Use filters to restrict list of retrieved documents, before checking these against provided ids
|
||||
docs_to_delete = self.get_all_documents(index, filters=filters)
|
||||
if ids:
|
||||
docs_to_delete = [doc for doc in docs_to_delete if doc.id in ids]
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import uuid
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@ -6,7 +7,6 @@ import pytest
|
||||
from haystack.schema import Document
|
||||
from ..conftest import get_document_store
|
||||
|
||||
|
||||
embedding_dim = 768
|
||||
|
||||
|
||||
@ -123,3 +123,24 @@ def test_get_all_documents_unaffected_by_QUERY_MAXIMUM_RESULTS(document_store_wi
|
||||
monkeypatch.setattr(document_store_with_docs, "get_document_count", lambda **kwargs: 13_000)
|
||||
docs = document_store_with_docs.get_all_documents()
|
||||
assert len(docs) == 3
|
||||
|
||||
|
||||
@pytest.mark.weaviate
|
||||
@pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True)
|
||||
def test_deleting_by_id_or_by_filters(document_store_with_docs):
|
||||
# This test verifies that deleting an object by its ID does not first require fetching all documents. This fixes
|
||||
# a bug, as described in https://github.com/deepset-ai/haystack/issues/2898
|
||||
document_store_with_docs.get_all_documents = MagicMock(wraps=document_store_with_docs.get_all_documents)
|
||||
|
||||
assert document_store_with_docs.get_document_count() == 3
|
||||
|
||||
# Delete a document by its ID. This should bypass the get_all_documents() call
|
||||
document_store_with_docs.delete_documents(ids=[DOCUMENTS_XS[0]["id"]])
|
||||
document_store_with_docs.get_all_documents.assert_not_called()
|
||||
assert document_store_with_docs.get_document_count() == 2
|
||||
|
||||
document_store_with_docs.get_all_documents.reset_mock()
|
||||
# Delete a document with filters. Prove that using the filters will go through get_all_documents()
|
||||
document_store_with_docs.delete_documents(filters={"name": ["filename2"]})
|
||||
document_store_with_docs.get_all_documents.assert_called()
|
||||
assert document_store_with_docs.get_document_count() == 1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user