mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-24 21:48:52 +00:00
Standardize delete_documents(filter=...) across all document stores (#1509)
* Make InMemoryDocumentStore accept and apply filters in delete_documents() * Modify test_document_store.py to test the filtered deletion in memory, sql and milvus too * Make FAISSDocumentStore accept and properly apply filters in delete_documents() * Add latest docstring and tutorial changes * Remove accidentally duplicated test * Remove unnecessary decorators from test/test_document_store.py::test_delete_documents_with_filters * Add embeddings count test for FAISS and Milvus; Milvus fails it. * Fixed a bug that made Milvus not deleting embeddings * Remove batch size parametrization in tests & update all documentstore's docstrings with a filter example * Add latest docstring and tutorial changes Co-authored-by: prafgup <prafulgupta6@gmail.com>
This commit is contained in:
parent
39d324ed17
commit
a30a826c6c
@ -464,6 +464,7 @@ Delete documents in an index. All documents are deleted if no filters are passed
|
||||
|
||||
- `index`: Index name to delete the document from.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
|
||||
**Returns**:
|
||||
|
||||
@ -742,8 +743,10 @@ Delete documents in an index. All documents are deleted if no filters are passed
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Index name to delete the document from.
|
||||
- `index`: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
|
||||
**Returns**:
|
||||
|
||||
@ -956,8 +959,10 @@ Delete documents in an index. All documents are deleted if no filters are passed
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Index name to delete the document from.
|
||||
- `index`: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
|
||||
**Returns**:
|
||||
|
||||
@ -1149,7 +1154,18 @@ Delete all documents from the document store.
|
||||
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
|
||||
```
|
||||
|
||||
Delete all documents from the document store.
|
||||
Delete documents from the document store. All documents are deleted if no filters are passed.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="faiss.FAISSDocumentStore.query_by_embedding"></a>
|
||||
#### query\_by\_embedding
|
||||
@ -1403,11 +1419,12 @@ None
|
||||
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
|
||||
```
|
||||
|
||||
Delete all documents (from SQL AND Milvus).
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: (SQL) index name for storing the docs and metadata
|
||||
- `index`: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the search space.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
|
||||
@ -1776,8 +1793,10 @@ Delete documents in an index. All documents are deleted if no filters are passed
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Index name to delete the document from.
|
||||
- `index`: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
|
||||
**Returns**:
|
||||
|
||||
|
||||
@ -1001,6 +1001,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
|
||||
:param index: Index name to delete the document from.
|
||||
:param filters: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:return: None
|
||||
"""
|
||||
index = index or self.index
|
||||
|
||||
@ -398,14 +398,23 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
|
||||
def delete_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
|
||||
"""
|
||||
Delete all documents from the document store.
|
||||
Delete documents from the document store. All documents are deleted if no filters are passed.
|
||||
|
||||
:param index: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:return: None
|
||||
"""
|
||||
if filters:
|
||||
logger.warning("Filters are not supported for deleting documents in FAISSDocumentStore.")
|
||||
index = index or self.index
|
||||
if index in self.faiss_indexes.keys():
|
||||
self.faiss_indexes[index].reset()
|
||||
super().delete_documents(index=index)
|
||||
if filters:
|
||||
affected_docs = self.get_all_documents(filters=filters)
|
||||
doc_ids = [doc.meta.get("vector_id") for doc in affected_docs if doc.meta and doc.meta.get("vector_id") is not None]
|
||||
self.faiss_indexes[index].remove_ids(np.array(doc_ids, dtype="int64"))
|
||||
else:
|
||||
self.faiss_indexes[index].reset()
|
||||
super().delete_documents(index=index, filters=filters)
|
||||
|
||||
def query_by_embedding(
|
||||
self,
|
||||
|
||||
@ -394,11 +394,15 @@ class InMemoryDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
:param index: Index name to delete the document from.
|
||||
:param index: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:return: None
|
||||
"""
|
||||
if filters:
|
||||
raise NotImplementedError("Delete by filters is not implemented for InMemoryDocumentStore.")
|
||||
index = index or self.index
|
||||
self.indexes[index] = {}
|
||||
if not filters:
|
||||
self.indexes[index] = {}
|
||||
return
|
||||
for doc in self.get_all_documents(filters=filters):
|
||||
del self.indexes[index][doc.id]
|
||||
|
||||
@ -391,14 +391,15 @@ class MilvusDocumentStore(SQLDocumentStore):
|
||||
|
||||
def delete_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
|
||||
"""
|
||||
Delete all documents (from SQL AND Milvus).
|
||||
:param index: (SQL) index name for storing the docs and metadata
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
:param index: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the search space.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:return: None
|
||||
"""
|
||||
index = index or self.index
|
||||
super().delete_documents(index=index, filters=filters)
|
||||
status, ok = self.milvus_server.has_collection(collection_name=index)
|
||||
if status.code != Status.SUCCESS:
|
||||
raise RuntimeError(f'Milvus has collection check failed: {status}')
|
||||
@ -414,6 +415,9 @@ class MilvusDocumentStore(SQLDocumentStore):
|
||||
self.milvus_server.flush([index])
|
||||
self.milvus_server.compact(collection_name=index)
|
||||
|
||||
# Delete from SQL at the end to allow the above .get_all_documents() to work properly
|
||||
super().delete_documents(index=index, filters=filters)
|
||||
|
||||
def get_all_documents_generator(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
@ -533,7 +537,7 @@ class MilvusDocumentStore(SQLDocumentStore):
|
||||
id_array=existing_vector_ids
|
||||
)
|
||||
if status.code != Status.SUCCESS:
|
||||
raise RuntimeError("E existing vector ids deletion failed: {status}")
|
||||
raise RuntimeError(f"Existing vector ids deletion failed: {status}")
|
||||
|
||||
def get_all_vectors(self, index: Optional[str] = None) -> List[np.ndarray]:
|
||||
"""
|
||||
|
||||
@ -479,8 +479,10 @@ class SQLDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
:param index: Index name to delete the document from.
|
||||
:param index: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:return: None
|
||||
"""
|
||||
index = index or self.index
|
||||
|
||||
@ -697,8 +697,10 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
"""
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
:param index: Index name to delete the document from.
|
||||
:param index: Index name to delete the document from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to be deleted.
|
||||
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:return: None
|
||||
"""
|
||||
index = index or self.index
|
||||
|
||||
@ -324,8 +324,7 @@ def test_delete_documents(document_store_with_docs):
|
||||
documents = document_store_with_docs.get_all_documents()
|
||||
assert len(documents) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
|
||||
|
||||
def test_delete_documents_with_filters(document_store_with_docs):
|
||||
document_store_with_docs.delete_documents(filters={"meta_field": ["test1", "test2"]})
|
||||
documents = document_store_with_docs.get_all_documents()
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import time
|
||||
import faiss
|
||||
import math
|
||||
import numpy as np
|
||||
@ -94,6 +95,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
|
||||
stored_emb = document_store.faiss_indexes[document_store.index].reconstruct(int(doc.meta["vector_id"]))
|
||||
# compare original input vec with stored one (ignore extra dim added by hnsw)
|
||||
assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
|
||||
@ -192,6 +194,22 @@ def test_finding(document_store, retriever):
|
||||
assert len(prediction.get('documents', [])) == 1
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
|
||||
@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
|
||||
def test_delete_docs_with_filters(document_store, retriever):
|
||||
document_store.write_documents(DOCUMENTS)
|
||||
document_store.update_embeddings(retriever=retriever, batch_size=4)
|
||||
assert document_store.get_embedding_count() == 6
|
||||
|
||||
document_store.delete_documents(filters={"name": ["name_1", "name_2", "name_3", "name_4"]})
|
||||
|
||||
documents = document_store.get_all_documents()
|
||||
assert len(documents) == 2
|
||||
assert document_store.get_embedding_count() == 2
|
||||
assert {doc.meta["name"] for doc in documents} == {"name_5", "name_6"}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
|
||||
@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
|
||||
def test_pipeline(document_store, retriever):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user