Standardize delete_documents(filter=...) across all document stores (#1509)

* Make InMemoryDocumentStore accept and apply filters in delete_documents()

* Modify test_document_store.py to test the filtered deletion in memory, sql and milvus too

* Make FAISSDocumentStore accept and properly apply filters in delete_documents()

* Add latest docstring and tutorial changes

* Remove accidentally duplicated test

* Remove unnecessary decorators from test/test_document_store.py::test_delete_documents_with_filters

* Add embeddings count test for FAISS and Milvus; Milvus fails it.

* Fixed a bug that made Milvus not deleting embeddings

* Remove batch size parametrization in tests & update all documentstore's docstrings with a filter example

* Add latest docstring and tutorial changes

Co-authored-by: prafgup <prafulgupta6@gmail.com>
This commit is contained in:
Sara Zan 2021-09-29 09:27:06 +02:00 committed by GitHub
parent 39d324ed17
commit a30a826c6c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 81 additions and 23 deletions

View File

@ -464,6 +464,7 @@ Delete documents in an index. All documents are deleted if no filters are passed
- `index`: Index name to delete the document from.
- `filters`: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
**Returns**:
@ -742,8 +743,10 @@ Delete documents in an index. All documents are deleted if no filters are passed
**Arguments**:
- `index`: Index name to delete the document from.
- `index`: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
- `filters`: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
**Returns**:
@ -956,8 +959,10 @@ Delete documents in an index. All documents are deleted if no filters are passed
**Arguments**:
- `index`: Index name to delete the document from.
- `index`: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
- `filters`: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
**Returns**:
@ -1149,7 +1154,18 @@ Delete all documents from the document store.
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```
Delete all documents from the document store.
Delete documents from the document store. All documents are deleted if no filters are passed.
**Arguments**:
- `index`: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
- `filters`: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
**Returns**:
None
<a name="faiss.FAISSDocumentStore.query_by_embedding"></a>
#### query\_by\_embedding
@ -1403,11 +1419,12 @@ None
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```
Delete all documents (from SQL AND Milvus).
Delete documents in an index. All documents are deleted if no filters are passed.
**Arguments**:
- `index`: (SQL) index name for storing the docs and metadata
- `index`: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
- `filters`: Optional filters to narrow down the search space.
Example: {"name": ["some", "more"], "category": ["only_one"]}
@ -1776,8 +1793,10 @@ Delete documents in an index. All documents are deleted if no filters are passed
**Arguments**:
- `index`: Index name to delete the document from.
- `index`: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
- `filters`: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
**Returns**:

View File

@ -1001,6 +1001,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
:param index: Index name to delete the document from.
:param filters: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
:return: None
"""
index = index or self.index

View File

@ -398,14 +398,23 @@ class FAISSDocumentStore(SQLDocumentStore):
def delete_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents from the document store.
Delete documents from the document store. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
:return: None
"""
if filters:
logger.warning("Filters are not supported for deleting documents in FAISSDocumentStore.")
index = index or self.index
if index in self.faiss_indexes.keys():
self.faiss_indexes[index].reset()
super().delete_documents(index=index)
if filters:
affected_docs = self.get_all_documents(filters=filters)
doc_ids = [doc.meta.get("vector_id") for doc in affected_docs if doc.meta and doc.meta.get("vector_id") is not None]
self.faiss_indexes[index].remove_ids(np.array(doc_ids, dtype="int64"))
else:
self.faiss_indexes[index].reset()
super().delete_documents(index=index, filters=filters)
def query_by_embedding(
self,

View File

@ -394,11 +394,15 @@ class InMemoryDocumentStore(BaseDocumentStore):
"""
Delete documents in an index. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from.
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
:return: None
"""
if filters:
raise NotImplementedError("Delete by filters is not implemented for InMemoryDocumentStore.")
index = index or self.index
self.indexes[index] = {}
if not filters:
self.indexes[index] = {}
return
for doc in self.get_all_documents(filters=filters):
del self.indexes[index][doc.id]

View File

@ -391,14 +391,15 @@ class MilvusDocumentStore(SQLDocumentStore):
def delete_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents (from SQL AND Milvus).
:param index: (SQL) index name for storing the docs and metadata
Delete documents in an index. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the search space.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:return: None
"""
index = index or self.index
super().delete_documents(index=index, filters=filters)
status, ok = self.milvus_server.has_collection(collection_name=index)
if status.code != Status.SUCCESS:
raise RuntimeError(f'Milvus has collection check failed: {status}')
@ -414,6 +415,9 @@ class MilvusDocumentStore(SQLDocumentStore):
self.milvus_server.flush([index])
self.milvus_server.compact(collection_name=index)
# Delete from SQL at the end to allow the above .get_all_documents() to work properly
super().delete_documents(index=index, filters=filters)
def get_all_documents_generator(
self,
index: Optional[str] = None,
@ -533,7 +537,7 @@ class MilvusDocumentStore(SQLDocumentStore):
id_array=existing_vector_ids
)
if status.code != Status.SUCCESS:
raise RuntimeError("E existing vector ids deletion failed: {status}")
raise RuntimeError(f"Existing vector ids deletion failed: {status}")
def get_all_vectors(self, index: Optional[str] = None) -> List[np.ndarray]:
"""

View File

@ -479,8 +479,10 @@ class SQLDocumentStore(BaseDocumentStore):
"""
Delete documents in an index. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from.
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
:return: None
"""
index = index or self.index

View File

@ -697,8 +697,10 @@ class WeaviateDocumentStore(BaseDocumentStore):
"""
Delete documents in an index. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from.
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}
:return: None
"""
index = index or self.index

View File

@ -324,8 +324,7 @@ def test_delete_documents(document_store_with_docs):
documents = document_store_with_docs.get_all_documents()
assert len(documents) == 0
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_delete_documents_with_filters(document_store_with_docs):
document_store_with_docs.delete_documents(filters={"meta_field": ["test1", "test2"]})
documents = document_store_with_docs.get_all_documents()

View File

@ -1,3 +1,4 @@
import time
import faiss
import math
import numpy as np
@ -94,6 +95,7 @@ def test_faiss_write_docs(document_store, index_buffer_size, batch_size):
stored_emb = document_store.faiss_indexes[document_store.index].reconstruct(int(doc.meta["vector_id"]))
# compare original input vec with stored one (ignore extra dim added by hnsw)
assert np.allclose(original_doc["embedding"], stored_emb, rtol=0.01)
@pytest.mark.slow
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@ -192,6 +194,22 @@ def test_finding(document_store, retriever):
assert len(prediction.get('documents', [])) == 1
@pytest.mark.slow
@pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
def test_delete_docs_with_filters(document_store, retriever):
document_store.write_documents(DOCUMENTS)
document_store.update_embeddings(retriever=retriever, batch_size=4)
assert document_store.get_embedding_count() == 6
document_store.delete_documents(filters={"name": ["name_1", "name_2", "name_3", "name_4"]})
documents = document_store.get_all_documents()
assert len(documents) == 2
assert document_store.get_embedding_count() == 2
assert {doc.meta["name"] for doc in documents} == {"name_5", "name_6"}
@pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
def test_pipeline(document_store, retriever):