diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 908c05208..16d1a33e0 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -61,7 +61,7 @@ None
```python
@abstractmethod
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Get documents from the document store.
@@ -70,8 +70,32 @@ Get documents from the document store.
- `index`: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
-- `filters`: Optional filters to narrow down the documents to return.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
- `return_embedding`: Whether to return the document embeddings.
- `batch_size`: Number of documents that are passed to bulk function at a time.
- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -82,7 +106,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
```python
@abstractmethod
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get documents from the document store. Under-the-hood, documents are fetched in batches from the
@@ -94,8 +118,32 @@ a large number of documents without having to load all documents in memory.
- `index`: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
-- `filters`: Optional filters to narrow down the documents to return.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+__Example__:
+```python
+filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+}
+```
- `return_embedding`: Whether to return the document embeddings.
- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -105,7 +153,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### get\_all\_labels\_aggregated
```python
-def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel]
+def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel]
```
Return all labels in the DocumentStore, aggregated into MultiLabel objects.
@@ -125,8 +173,32 @@ object, provided that they have the same product_id (to be found in Label.meta["
- `index`: Name of the index to get the labels from. If None, the
DocumentStore's default index (self.index) will be used.
-- `filters`: Optional filters to narrow down the labels to return.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to labels whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
- `open_domain`: When True, labels are aggregated purely based on the question text alone.
When False, labels are aggregated in a closed domain fashion based on the question text
and also the id of the document that the label is tied to. In this setting, this function
@@ -216,7 +288,7 @@ Base class for implementing Document Stores that support keyword searches.
```python
@abstractmethod
-def query(query: Optional[str], filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number documents
@@ -226,7 +298,69 @@ that are most relevant to the query as defined by keyword matching algorithms li
**Arguments**:
- `query`: The query
-- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
- `top_k`: How many documents to return per query.
- `custom_query`: Custom query to be executed.
- `index`: The name of the index in the DocumentStore from which to retrieve documents
@@ -280,7 +414,7 @@ to performance issues. Note that Elasticsearch limits the number of results to 1
#### get\_metadata\_values\_by\_key
```python
-def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict]
+def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict]
```
Get values associated with a metadata key. The output is in the format:
@@ -302,7 +436,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -399,7 +533,7 @@ Update the metadata dictionary of a document by specifying its string id
#### get\_document\_count
```python
-def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
```
Return the number of documents in the document store.
@@ -419,7 +553,7 @@ Return the number of labels in the document store
#### get\_embedding\_count
```python
-def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> int
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> int
```
Return the count of embeddings in the document store.
@@ -429,7 +563,7 @@ Return the count of embeddings in the document store.
#### get\_all\_documents
```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Get documents from the document store.
@@ -449,7 +583,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -473,7 +607,7 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c
#### get\_all\_documents\_generator
```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get documents from the document store. Under-the-hood, documents are fetched in batches from the
@@ -496,7 +630,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -520,7 +654,7 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c
#### get\_all\_labels
```python
-def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label]
+def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label]
```
Return all labels in the document store
@@ -530,7 +664,7 @@ Return all labels in the document store
#### query
```python
-def query(query: Optional[str], filters: Optional[Dict[str, Any]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number documents
@@ -552,7 +686,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -577,10 +711,10 @@ Example:
}
```
-To use the same logical operator multiple times on the same level, logical operators take
-optionally a list of dictionaries as value.
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
-Example:
+ __Example__:
```python
filters = {
"$or": [
@@ -677,7 +811,7 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c
#### query\_by\_embedding
```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -697,7 +831,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -722,10 +856,10 @@ Example:
}
```
-To use the same logical operator multiple times on the same level, logical operators take
-optionally a list of dictionaries as value.
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
-Example:
+ __Example__:
```python
filters = {
"$or": [
@@ -769,7 +903,7 @@ Return a summary of the documents in the document store
#### update\_embeddings
```python
-def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None)
+def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None)
```
Updates the embeddings in the the document store using the encoding model specified in the retriever.
@@ -795,7 +929,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -822,7 +956,7 @@ None
#### delete\_all\_documents
```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -841,7 +975,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -867,7 +1001,7 @@ None
#### delete\_documents
```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -888,7 +1022,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -903,9 +1037,9 @@ Example:
}
```
-If filters are provided along with a list of IDs, this method deletes the
-intersection of the two query results (documents that match the filters and
-have their ID in the list).
+ If filters are provided along with a list of IDs, this method deletes the
+ intersection of the two query results (documents that match the filters and
+ have their ID in the list).
- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
@@ -918,7 +1052,7 @@ None
#### delete\_labels
```python
-def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete labels in an index. All labels are deleted if no filters are passed.
@@ -939,7 +1073,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -991,7 +1125,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore)
#### query\_by\_embedding
```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -1011,7 +1145,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
-Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -1036,10 +1170,10 @@ Example:
}
```
-To use the same logical operator multiple times on the same level, logical operators take
-optionally a list of dictionaries as value.
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
-Example:
+ __Example__:
```python
filters = {
"$or": [
@@ -1185,7 +1319,7 @@ Calculate similarity scores between query embedding and a list of documents usin
#### query\_by\_embedding
```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -1204,7 +1338,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### update\_embeddings
```python
-def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
```
Updates the embeddings in the the document store using the encoding model specified in the retriever.
@@ -1232,7 +1366,7 @@ None
#### get\_document\_count
```python
-def get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
```
Return the number of documents in the document store.
@@ -1262,7 +1396,7 @@ Return the number of labels in the document store.
#### get\_all\_documents
```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Get all documents from the document store as a list.
@@ -1280,7 +1414,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### get\_all\_documents\_generator
```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get all documents from the document store. The methods returns a Python Generator that yields individual
@@ -1300,7 +1434,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### get\_all\_labels
```python
-def get_all_labels(index: str = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label]
+def get_all_labels(index: str = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label]
```
Return all labels in the document store.
@@ -1310,7 +1444,7 @@ Return all labels in the document store.
#### delete\_all\_documents
```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -1329,7 +1463,7 @@ None
#### delete\_documents
```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -1354,7 +1488,7 @@ None
#### delete\_labels
```python
-def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete labels in an index. All labels are deleted if no filters are passed.
@@ -1418,7 +1552,7 @@ Fetch documents by specifying a list of text vector id strings
#### get\_all\_documents\_generator
```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get documents from the document store. Under-the-hood, documents are fetched in batches from the
@@ -1527,7 +1661,7 @@ Update the metadata dictionary of a document by specifying its string id
#### get\_document\_count
```python
-def get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
```
Return the number of documents in the document store.
@@ -1547,7 +1681,7 @@ Return the number of labels in the document store
#### delete\_all\_documents
```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -1566,7 +1700,7 @@ None
#### delete\_documents
```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -1591,7 +1725,7 @@ None
#### delete\_labels
```python
-def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete labels from the document store. All labels are deleted if no filters are passed.
@@ -1664,7 +1798,7 @@ None
#### update\_embeddings
```python
-def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, List[str]]] = None, batch_size: int = 10_000)
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None, batch_size: int = 10_000)
```
Updates the embeddings in the the document store using the encoding model specified in the retriever.
@@ -1692,7 +1826,7 @@ None
#### get\_all\_documents\_generator
```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
@@ -1714,7 +1848,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### get\_embedding\_count
```python
-def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int
```
Return the count of embeddings in the document store.
@@ -1747,7 +1881,7 @@ None
#### delete\_all\_documents
```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete all documents from the document store.
@@ -1757,7 +1891,7 @@ Delete all documents from the document store.
#### delete\_documents
```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents from the document store. All documents are deleted if no filters are passed.
@@ -1782,7 +1916,7 @@ None
#### query\_by\_embedding
```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -1902,7 +2036,7 @@ None
#### update\_embeddings
```python
-def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, List[str]]] = None)
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None)
```
Updates the embeddings in the document store using the encoding model specified in the retriever.
@@ -1930,7 +2064,7 @@ None
#### query\_by\_embedding
```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -1953,7 +2087,7 @@ list of Documents that are the most similar to `query_emb`
#### delete\_all\_documents
```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete all documents (from SQL AND Milvus).
@@ -1973,7 +2107,7 @@ None
#### delete\_documents
```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -1998,7 +2132,7 @@ None
#### get\_all\_documents\_generator
```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
@@ -2020,7 +2154,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### get\_all\_documents
```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Get documents from the document store (optionally using filter criteria).
@@ -2091,7 +2225,7 @@ List[np.array]: List of vectors.
#### get\_embedding\_count
```python
-def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int
+def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int
```
Return the count of embeddings in the document store.
@@ -2183,27 +2317,29 @@ None
#### update\_document\_meta
```python
-def update_document_meta(id: str, meta: Dict[str, str], index: str = None)
+def update_document_meta(id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None)
```
Update the metadata dictionary of a document by specifying its string id.
+Overwrites only the specified fields; the unspecified ones remain unchanged.
#### get\_embedding\_count
```python
-def get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int
+def get_embedding_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None) -> int
```
-Return the number of embeddings in the document store, which is the same as the number of documents since every document has a default embedding
+Return the number of embeddings in the document store, which is the same as the number of documents since
+every document has a default embedding.
#### get\_document\_count
```python
-def get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
+def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int
```
Return the number of documents in the document store.
@@ -2213,7 +2349,7 @@ Return the number of documents in the document store.
#### get\_all\_documents
```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Get documents from the document store.
@@ -2222,8 +2358,32 @@ Get documents from the document store.
- `index`: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
-- `filters`: Optional filters to narrow down the documents to return.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
- `return_embedding`: Whether to return the document embeddings.
- `batch_size`: When working with a large number of documents, batching can help reduce memory footprint.
@@ -2232,7 +2392,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### get\_all\_documents\_generator
```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get documents from the document store. Under-the-hood, documents are fetched in batches from the
@@ -2244,8 +2404,32 @@ a large number of documents without having to load all documents in memory.
- `index`: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
-- `filters`: Optional filters to narrow down the documents to return.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
- `return_embedding`: Whether to return the document embeddings.
- `batch_size`: When working with a large number of documents, batching can help reduce memory footprint.
@@ -2254,7 +2438,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### query
```python
-def query(query: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None) -> List[Document]
+def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
@@ -2264,7 +2448,69 @@ that are most relevant to the query as defined by Weaviate semantic search.
**Arguments**:
- `query`: The query
-- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
- `top_k`: How many documents to return per query.
- `custom_query`: Custom query that will be executed using the query.raw method. For more details, refer to
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
@@ -2275,7 +2521,7 @@ https://www.semi.technology/developers/weaviate/current/graphql-references/filte
#### query\_by\_embedding
```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -2283,8 +2529,69 @@ Find the document that is most similar to the provided `query_emb` by using a ve
**Arguments**:
- `query_emb`: Embedding of the query (e.g. gathered from DPR)
-- `filters`: Optional filters to narrow down the search space.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
- `top_k`: How many documents to return
- `index`: index name for storing the docs and metadata
- `return_embedding`: To return document embedding
@@ -2294,7 +2601,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### update\_embeddings
```python
-def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
+def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000)
```
Updates the embeddings in the document store using the encoding model specified in the retriever.
@@ -2307,8 +2614,32 @@ This can be useful if want to change the embeddings for your documents (e.g. aft
- `index`: Index name to update
- `update_existing_embeddings`: Weaviate mandates an embedding while creating the document itself.
This option must be always true for weaviate and it will update the embeddings for all the documents.
-- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
- `batch_size`: When working with a large number of documents, batching can help reduce memory footprint.
**Returns**:
@@ -2320,7 +2651,7 @@ None
#### delete\_all\_documents
```python
-def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -2328,7 +2659,32 @@ Delete documents in an index. All documents are deleted if no filters are passed
**Arguments**:
- `index`: Index name to delete the document from.
-- `filters`: Optional filters to narrow down the documents to be deleted.
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
**Returns**:
@@ -2339,7 +2695,7 @@ None
#### delete\_documents
```python
-def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None)
+def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
```
Delete documents in an index. All documents are deleted if no filters are passed.
@@ -2349,11 +2705,35 @@ Delete documents in an index. All documents are deleted if no filters are passed
- `index`: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
- `ids`: Optional list of IDs to narrow down the documents to be deleted.
-- `filters`: Optional filters to narrow down the documents to be deleted.
-Example filters: {"name": ["some", "more"], "category": ["only_one"]}.
-If filters are provided along with a list of IDs, this method deletes the
-intersection of the two query results (documents that match the filters and
-have their ID in the list).
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+ If filters are provided along with a list of IDs, this method deletes the
+ intersection of the two query results (documents that match the filters and
+ have their ID in the list).
**Returns**:
@@ -2532,7 +2912,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore)
#### get\_all\_documents
```python
-def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Get documents from the document store.
@@ -2541,8 +2921,32 @@ Get documents from the document store.
- `index`: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
-- `filters`: Optional filters to narrow down the documents to return.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
- `return_embedding`: Whether to return the document embeddings.
- `batch_size`: Number of documents that are passed to bulk function at a time.
- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -2552,7 +2956,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### get\_all\_documents\_generator
```python
-def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```
Get documents from the document store. Under-the-hood, documents are fetched in batches from the
@@ -2564,8 +2968,32 @@ a large number of documents without having to load all documents in memory.
- `index`: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
-- `filters`: Optional filters to narrow down the documents to return.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
- `return_embedding`: Whether to return the document embeddings.
- `batch_size`: When working with a large number of documents, batching can help reduce memory footprint.
- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -2575,7 +3003,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### query\_by\_embedding
```python
-def query_by_embedding(query_emb: np.ndarray, filters: Optional[Optional[Dict[str, List[str]]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -2583,8 +3011,69 @@ Find the document that is most similar to the provided `query_emb` by using a ve
**Arguments**:
- `query_emb`: Embedding of the query (e.g. gathered from DPR)
-- `filters`: Optional filters to narrow down the search space.
-Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
- `top_k`: How many documents to return
- `index`: Index name for storing the docs and metadata
- `return_embedding`: To return document embedding
@@ -2595,7 +3084,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]}
#### query
```python
-def query(query: Optional[str], filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number documents
@@ -2605,7 +3094,69 @@ that are most relevant to the query as defined by the BM25 algorithm.
**Arguments**:
- `query`: The query
-- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+conditions.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
- `top_k`: How many documents to return per query.
- `custom_query`: Custom query to be executed.
- `index`: The name of the index in the DocumentStore from which to retrieve documents
diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json
index f5fcee5b7..12ddf0ba0 100644
--- a/docs/_src/api/openapi/openapi.json
+++ b/docs/_src/api/openapi/openapi.json
@@ -2,7 +2,7 @@
"openapi": "3.0.2",
"info": {
"title": "Haystack REST API",
- "version": "1.1.0"
+ "version": "1.2.0rc0"
},
"paths": {
"/initialized": {
diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py
index 058deb992..9c6f78e67 100644
--- a/haystack/document_stores/base.py
+++ b/haystack/document_stores/base.py
@@ -98,7 +98,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -108,8 +108,33 @@ class BaseDocumentStore(BaseComponent):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
- :param filters: Optional filters to narrow down the documents to return.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
:param return_embedding: Whether to return the document embeddings.
:param batch_size: Number of documents that are passed to bulk function at a time.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -120,7 +145,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -132,8 +157,33 @@ class BaseDocumentStore(BaseComponent):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
- :param filters: Optional filters to narrow down the documents to return.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -158,7 +208,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_labels(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
) -> List[Label]:
pass
@@ -166,7 +216,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_labels_aggregated(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
open_domain: bool = True,
drop_negative_labels: bool = False,
drop_no_answers: bool = False,
@@ -188,8 +238,33 @@ class BaseDocumentStore(BaseComponent):
:param index: Name of the index to get the labels from. If None, the
DocumentStore's default index (self.index) will be used.
- :param filters: Optional filters to narrow down the labels to return.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to restrict the returned labels to those whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+
:param open_domain: When True, labels are aggregated purely based on the question text alone.
When False, labels are aggregated in a closed domain fashion based on the question text
and also the id of the document that the label is tied to. In this setting, this function
@@ -260,7 +335,7 @@ class BaseDocumentStore(BaseComponent):
@abstractmethod
def get_document_count(
self,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@@ -299,7 +374,7 @@ class BaseDocumentStore(BaseComponent):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[Optional[Dict[str, List[str]]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -413,7 +488,7 @@ class BaseDocumentStore(BaseComponent):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
pass
@@ -423,7 +498,7 @@ class BaseDocumentStore(BaseComponent):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
pass
@@ -433,7 +508,7 @@ class BaseDocumentStore(BaseComponent):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
pass
@@ -564,7 +639,7 @@ class KeywordDocumentStore(BaseDocumentStore):
def query(
self,
query: Optional[str],
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@@ -575,7 +650,70 @@ class KeywordDocumentStore(BaseDocumentStore):
that are most relevant to the query as defined by keyword matching algorithms like BM25.
:param query: The query
- :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
+
:param top_k: How many documents to return per query.
:param custom_query: Custom query to be executed.
:param index: The name of the index in the DocumentStore from which to retrieve documents
diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py
index d806420a2..3a76f2313 100644
--- a/haystack/document_stores/deepsetcloud.py
+++ b/haystack/document_stores/deepsetcloud.py
@@ -77,7 +77,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -87,8 +87,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
- :param filters: Optional filters to narrow down the documents to return.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: Number of documents that are passed to bulk function at a time.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -106,7 +130,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -118,8 +142,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
- :param filters: Optional filters to narrow down the documents to return.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@@ -168,7 +216,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_document_count(
self,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@@ -184,7 +232,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[Optional[Dict[str, List[str]]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -194,8 +242,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
:param query_emb: Embedding of the query (e.g. gathered from DPR)
- :param filters: Optional filters to narrow down the search space.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
:param top_k: How many documents to return
:param index: Index name for storing the docs and metadata
:param return_embedding: To return document embedding
@@ -220,7 +329,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def query(
self,
query: Optional[str],
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@@ -231,7 +340,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
that are most relevant to the query as defined by the BM25 algorithm.
:param query: The query
- :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
:param top_k: How many documents to return per query.
:param custom_query: Custom query to be executed.
:param index: The name of the index in the DocumentStore from which to retrieve documents
@@ -280,7 +451,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_all_labels(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
) -> List[Label]:
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.")
@@ -299,7 +470,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.")
@@ -308,7 +479,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.")
@@ -317,7 +488,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.")
diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
index 563f4a9ed..67a45e722 100644
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -478,7 +478,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
key: str,
query: Optional[str] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
) -> List[dict]:
@@ -499,7 +499,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -705,7 +705,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_document_count(
self,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@@ -736,7 +736,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_embedding_count(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
) -> int:
"""
@@ -756,7 +756,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -777,7 +777,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -805,7 +805,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -828,7 +828,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -862,7 +862,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_all_labels(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
batch_size: int = 10_000,
) -> List[Label]:
@@ -879,7 +879,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def _get_all_documents_in_index(
self,
index: str,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
batch_size: int = 10_000,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@@ -901,7 +901,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def query(
self,
query: Optional[str],
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@@ -924,7 +924,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -949,10 +949,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
}
```
- To use the same logical operator multiple times on the same level, logical operators take
- optionally a list of dictionaries as value.
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
- Example:
+ __Example__:
```python
filters = {
"$or": [
@@ -1105,7 +1105,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -1127,7 +1127,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -1152,10 +1152,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
}
```
- To use the same logical operator multiple times on the same level, logical operators take
- optionally a list of dictionaries as value.
+ To use the same logical operator multiple times on the same level, logical operators can
+ optionally take a list of dictionaries as value.
- Example:
+ __Example__:
```python
filters = {
"$or": [
@@ -1348,7 +1348,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
retriever,
index: Optional[str] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -1374,7 +1374,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -1449,7 +1449,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -1467,7 +1467,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -1497,7 +1497,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -1517,7 +1517,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -1532,9 +1532,9 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
}
```
- If filters are provided along with a list of IDs, this method deletes the
- intersection of the two query results (documents that match the filters and
- have their ID in the list).
+ If filters are provided along with a list of IDs, this method deletes the
+ intersection of the two query results (documents that match the filters and
+ have their ID in the list).
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
:return: None
@@ -1560,7 +1560,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -1580,7 +1580,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -1695,7 +1695,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[Dict[str, Any]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -1717,7 +1717,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
- Example:
+ __Example__:
```python
filters = {
"$and": {
@@ -1742,10 +1742,10 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
}
```
- To use the same logical operator multiple times on the same level, logical operators take
- optionally a list of dictionaries as value.
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
- Example:
+ __Example__:
```python
filters = {
"$or": [
diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py
index 757bc4e6a..ea64fbede 100644
--- a/haystack/document_stores/faiss.py
+++ b/haystack/document_stores/faiss.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from haystack.nodes.retriever import BaseRetriever
@@ -308,7 +308,7 @@ class FAISSDocumentStore(SQLDocumentStore):
retriever: "BaseRetriever",
index: Optional[str] = None,
update_existing_embeddings: bool = True,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
batch_size: int = 10_000,
):
"""
@@ -379,7 +379,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -396,7 +396,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -447,7 +447,7 @@ class FAISSDocumentStore(SQLDocumentStore):
doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
return documents
- def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int:
+ def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int:
"""
Return the count of embeddings in the document store.
"""
@@ -486,7 +486,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -507,7 +507,7 @@ class FAISSDocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -546,7 +546,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py
index a45b98734..6b97a0f6b 100644
--- a/haystack/document_stores/filter_utils.py
+++ b/haystack/document_stores/filter_utils.py
@@ -1,9 +1,11 @@
-from typing import Union, List, Dict
+from typing import Union, List, Dict, Optional, Tuple
from abc import ABC, abstractmethod
from collections import defaultdict
+from haystack.document_stores.utils import convert_date_to_rfc3339
-def nested_defaultdict():
+
+def nested_defaultdict() -> defaultdict:
"""
Data structure that recursively adds a dictionary as value if a key does not exist. Advantage: In nested dictionary
structures, we don't need to check if a key already exists (which can become hard to maintain in nested dictionaries
@@ -81,17 +83,17 @@ class LogicalFilterClause(ABC):
"""
- def __init__(self, conditions: List["LogicalFilterClause"]):
+ def __init__(self, conditions: List[Union["LogicalFilterClause", "ComparisonOperation"]]):
self.conditions = conditions
@classmethod
- def parse(cls, filter_term: Union[dict, List[dict]]):
+ def parse(cls, filter_term: Union[dict, List[dict]]) -> Union["LogicalFilterClause", "ComparisonOperation"]:
"""
Parses a filter dictionary/list and returns a LogicalFilterClause instance.
:param filter_term: Dictionary or list that contains the filter definition.
"""
- conditions = []
+ conditions: List[Union[LogicalFilterClause, ComparisonOperation]] = []
if isinstance(filter_term, dict):
filter_term = [filter_term]
@@ -122,7 +124,14 @@ class LogicalFilterClause(ABC):
"""
pass
- def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict]:
+ @abstractmethod
+ def convert_to_weaviate(self):
+ """
+ Converts the LogicalFilterClause instance to a Weaviate filter.
+ """
+ pass
+
+ def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict[str, Dict]]:
"""
Merges Elasticsearch range queries that perform on the same metadata field.
"""
@@ -142,14 +151,23 @@ class LogicalFilterClause(ABC):
return conditions
+    @abstractmethod
+    def invert(self) -> Union["LogicalFilterClause", "ComparisonOperation"]:
+        """
+        Inverts the LogicalFilterClause instance, i.e. returns a clause that expresses its logical negation.
+        Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore.
+        (https://github.com/semi-technologies/weaviate/issues/1717)
+        """
+        pass
+
class ComparisonOperation(ABC):
- def __init__(self, field_name: str, comparison_value: Union[str, float, List]):
+ def __init__(self, field_name: str, comparison_value: Union[str, int, float, bool, List]):
self.field_name = field_name
self.comparison_value = comparison_value
@classmethod
- def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]):
+ def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]) -> List["ComparisonOperation"]:
comparison_operations: List[ComparisonOperation] = []
if isinstance(comparison_clause, dict):
@@ -187,107 +205,273 @@ class ComparisonOperation(ABC):
"""
pass
+ @abstractmethod
+ def convert_to_weaviate(self):
+ """
+ Converts the ComparisonOperation instance to a Weaviate comparison operator.
+ """
+ pass
+
+ @abstractmethod
+ def invert(self) -> "ComparisonOperation":
+ """
+ Inverts the ComparisonOperation.
+ Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore.
+ (https://github.com/semi-technologies/weaviate/issues/1717)
+ """
+ pass
+
+    def _get_weaviate_datatype(
+        self, value: Optional[Union[str, int, float, bool]] = None
+    ) -> Tuple[str, Union[str, int, float, bool]]:
+        """
+        Determines the Weaviate value type of the comparison value and converts it to RFC3339 format if it is a
+        date, as Weaviate requires dates to be in RFC3339 format including the time and timezone.
+        Falls back to `self.comparison_value` if no `value` is passed; raises ValueError for unsupported types.
+        """
+        if value is None:
+            assert not isinstance(self.comparison_value, list)  # Necessary for mypy
+            value = self.comparison_value
+
+        if isinstance(value, str):
+            # Check if comparison value is a date
+            try:
+                value = convert_date_to_rfc3339(value)
+                data_type = "valueDate"
+            # Comparison value is a plain string
+            except ValueError:
+                data_type = "valueString"
+        # bool must be tested before int: bool is a subclass of int, so True/False would otherwise become valueInt
+        elif isinstance(value, bool):
+            data_type = "valueBoolean"
+        elif isinstance(value, int):
+            data_type = "valueInt"
+        elif isinstance(value, float):
+            data_type = "valueNumber"
+        else:
+            raise ValueError(
+                f"Unsupported data type of comparison value for {self.__class__.__name__}. "
+                f"Value needs to be of type str, int, float, or bool."
+            )
+        return data_type, value
+
class NotOperation(LogicalFilterClause):
"""
Handles conversion of logical 'NOT' operations.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict]:
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
conditions = self._merge_es_range_queries(conditions)
return {"bool": {"must_not": conditions}}
+ def convert_to_weaviate(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]:
+ conditions = [condition.invert().convert_to_weaviate() for condition in self.conditions]
+ if len(conditions) > 1:
+ # Conditions in self.conditions are by default combined with AND which becomes OR according to DeMorgan
+ return {"operator": "Or", "operands": conditions}
+ else:
+ return conditions[0]
+
+ def invert(self) -> Union[LogicalFilterClause, ComparisonOperation]:
+ # This method is called when a "$not" operation is embedded in another "$not" operation. Therefore, we don't
+ # invert the operations here, as two "$not" operation annihilate each other.
+ # (If we have more than one condition, we return an AndOperation, the default logical operation for combining
+ # multiple conditions.)
+ if len(self.conditions) > 1:
+ return AndOperation(self.conditions)
+ else:
+ return self.conditions[0]
+
class AndOperation(LogicalFilterClause):
"""
Handles conversion of logical 'AND' operations.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict]:
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
conditions = self._merge_es_range_queries(conditions)
return {"bool": {"must": conditions}}
+ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
+ conditions = [condition.convert_to_weaviate() for condition in self.conditions]
+ return {"operator": "And", "operands": conditions}
+
+ def invert(self) -> "OrOperation":
+ return OrOperation([condition.invert() for condition in self.conditions])
+
class OrOperation(LogicalFilterClause):
"""
Handles conversion of logical 'OR' operations.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict]:
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
conditions = self._merge_es_range_queries(conditions)
return {"bool": {"should": conditions}}
+ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
+ conditions = [condition.convert_to_weaviate() for condition in self.conditions]
+ return {"operator": "Or", "operands": conditions}
+
+ def invert(self) -> AndOperation:
+ return AndOperation([condition.invert() for condition in self.conditions])
+
class EqOperation(ComparisonOperation):
"""
Handles conversion of the '$eq' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Union[str, int, float, bool]]]:
+ assert not isinstance(self.comparison_value, list), "Use '$in' operation for lists as comparison values."
return {"term": {self.field_name: self.comparison_value}}
+ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]:
+ comp_value_type, comp_value = self._get_weaviate_datatype()
+ return {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value}
+
+ def invert(self) -> "NeOperation":
+ return NeOperation(self.field_name, self.comparison_value)
+
class InOperation(ComparisonOperation):
"""
Handles conversion of the '$in' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, List]]:
+ assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
return {"terms": {self.field_name: self.comparison_value}}
+    def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
+        """
+        Builds an "Or" filter holding one "Equal" operand per value of the comparison list.
+        """
+        assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
+        operands: List[Dict] = []
+        for candidate in self.comparison_value:
+            # The type key is determined per value by _get_weaviate_datatype
+            value_key, converted = self._get_weaviate_datatype(candidate)
+            operands.append({"path": [self.field_name], "operator": "Equal", value_key: converted})
+        return {"operator": "Or", "operands": operands}
+
+ def invert(self) -> "NinOperation":
+ return NinOperation(self.field_name, self.comparison_value)
+
class NeOperation(ComparisonOperation):
"""
Handles conversion of the '$ne' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, Union[str, int, float, bool]]]]]:
+ assert not isinstance(self.comparison_value, list), "Use '$nin' operation for lists as comparison values."
return {"bool": {"must_not": {"term": {self.field_name: self.comparison_value}}}}
+ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]:
+ comp_value_type, comp_value = self._get_weaviate_datatype()
+ return {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value}
+
+ def invert(self) -> "EqOperation":
+ return EqOperation(self.field_name, self.comparison_value)
+
class NinOperation(ComparisonOperation):
"""
Handles conversion of the '$nin' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, List]]]]:
+ assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
return {"bool": {"must_not": {"terms": {self.field_name: self.comparison_value}}}}
+    def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
+        """
+        Builds an "And" filter holding one "NotEqual" operand per value of the comparison list.
+        """
+        assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
+        operands: List[Dict] = []
+        for candidate in self.comparison_value:
+            # The type key is determined per value by _get_weaviate_datatype
+            value_key, converted = self._get_weaviate_datatype(candidate)
+            operands.append({"path": [self.field_name], "operator": "NotEqual", value_key: converted})
+        return {"operator": "And", "operands": operands}
+
+ def invert(self) -> "InOperation":
+ return InOperation(self.field_name, self.comparison_value)
+
class GtOperation(ComparisonOperation):
"""
Handles conversion of the '$gt' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
+ assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list."
return {"range": {self.field_name: {"gt": self.comparison_value}}}
+ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
+ comp_value_type, comp_value = self._get_weaviate_datatype()
+ assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list."
+ return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value}
+
+ def invert(self) -> "LteOperation":
+ return LteOperation(self.field_name, self.comparison_value)
+
class GteOperation(ComparisonOperation):
"""
Handles conversion of the '$gte' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
+ assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list."
return {"range": {self.field_name: {"gte": self.comparison_value}}}
+ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
+ comp_value_type, comp_value = self._get_weaviate_datatype()
+ assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list."
+ return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value}
+
+ def invert(self) -> "LtOperation":
+ return LtOperation(self.field_name, self.comparison_value)
+
class LtOperation(ComparisonOperation):
"""
Handles conversion of the '$lt' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
+ assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list."
return {"range": {self.field_name: {"lt": self.comparison_value}}}
+ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
+ comp_value_type, comp_value = self._get_weaviate_datatype()
+ assert not isinstance(comp_value, list), "Comparison value for '$lt' operation must not be a list."
+ return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value}
+
+ def invert(self) -> "GteOperation":
+ return GteOperation(self.field_name, self.comparison_value)
+
class LteOperation(ComparisonOperation):
"""
Handles conversion of the '$lte' comparison operation.
"""
- def convert_to_elasticsearch(self):
+ def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
+ assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list."
return {"range": {self.field_name: {"lte": self.comparison_value}}}
+
+ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
+ comp_value_type, comp_value = self._get_weaviate_datatype()
+ assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list."
+ return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value}
+
+ def invert(self) -> "GtOperation":
+ return GtOperation(self.field_name, self.comparison_value)
diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py
index 6745ceb12..583c09e61 100644
--- a/haystack/document_stores/memory.py
+++ b/haystack/document_stores/memory.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator
+from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator, Any
if TYPE_CHECKING:
from haystack.nodes.retriever import BaseRetriever
@@ -291,7 +291,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -337,7 +337,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
retriever: "BaseRetriever",
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
):
@@ -390,7 +390,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_document_count(
self,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@@ -427,7 +427,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def _query(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
only_documents_without_embedding: bool = False,
):
@@ -464,7 +464,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -490,7 +490,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -514,7 +514,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_labels(
self,
index: str = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
) -> List[Label]:
"""
@@ -544,7 +544,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -569,7 +569,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -603,7 +603,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
diff --git a/haystack/document_stores/milvus.py b/haystack/document_stores/milvus.py
index dbb1beb08..242307094 100644
--- a/haystack/document_stores/milvus.py
+++ b/haystack/document_stores/milvus.py
@@ -307,7 +307,7 @@ class MilvusDocumentStore(SQLDocumentStore):
index: Optional[str] = None,
batch_size: int = 10_000,
update_existing_embeddings: bool = True,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
):
"""
Updates the embeddings in the the document store using the encoding model specified in the retriever.
@@ -374,7 +374,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[dict] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -440,7 +440,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -465,7 +465,7 @@ class MilvusDocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -508,7 +508,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -541,7 +541,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -676,7 +676,11 @@ class MilvusDocumentStore(SQLDocumentStore):
return vectors
- def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int:
+ def get_embedding_count(
+ self,
+ index: Optional[str] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
+ ) -> int:
"""
Return the count of embeddings in the document store.
"""
diff --git a/haystack/document_stores/milvus2x.py b/haystack/document_stores/milvus2x.py
index bd3009795..1b28c2cfa 100644
--- a/haystack/document_stores/milvus2x.py
+++ b/haystack/document_stores/milvus2x.py
@@ -375,7 +375,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
index: Optional[str] = None,
batch_size: int = 10_000,
update_existing_embeddings: bool = True,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
):
"""
Updates the embeddings in the the document store using the encoding model specified in the retriever.
@@ -457,7 +457,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[dict] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -538,7 +538,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -571,7 +571,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -604,7 +604,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py
index 1948966d8..39fa07a85 100644
--- a/haystack/document_stores/sql.py
+++ b/haystack/document_stores/sql.py
@@ -215,7 +215,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -233,7 +233,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -271,7 +271,7 @@ class SQLDocumentStore(BaseDocumentStore):
def _query(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
vector_ids: Optional[List[str]] = None,
only_documents_without_embedding: bool = False,
batch_size: int = 10_000,
@@ -521,7 +521,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_document_count(
self,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@@ -609,7 +609,7 @@ class SQLDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -634,7 +634,7 @@ class SQLDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -674,7 +674,7 @@ class SQLDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
diff --git a/haystack/document_stores/utils.py b/haystack/document_stores/utils.py
index 1e7c74c7c..5743a1502 100644
--- a/haystack/document_stores/utils.py
+++ b/haystack/document_stores/utils.py
@@ -2,6 +2,7 @@ from typing import Dict, List, Optional, Tuple, Union, Generator
import json
import logging
+from datetime import datetime
from haystack.schema import Document, Label, Answer, Span
from haystack.nodes.preprocessor import PreProcessor
@@ -250,3 +251,23 @@ def _extract_docs_and_labels_from_dict(
labels.append(label)
return docs, labels, problematic_ids
+
+
+def convert_date_to_rfc3339(date: str) -> str:
+ """
+ Converts a date to RFC3339 format, as Weaviate requires dates to be in RFC3339 format including the time and
+ timezone.
+
+ If the provided date string does not contain a time and/or timezone, we use 00:00 as default time
+ and UTC as default time zone.
+
+ This function cannot be part of WeaviateDocumentStore, as this would result in a circular import between weaviate.py
+ and filter_utils.py.
+
+ :param date: Date string in ISO 8601 format, e.g. "2019-10-01" or "2019-10-01T12:30:00+02:00". A trailing
+ "Z" (UTC designator) is accepted as well, so strings that are already in RFC3339 format can be passed.
+ :return: The date as RFC3339 string. Inputs with a timezone keep their UTC offset (e.g. "+00:00"), inputs
+ without a timezone get "Z" appended.
+ :raises ValueError: If the string cannot be parsed as an ISO 8601 date.
+ """
+ # datetime.fromisoformat() only understands the "Z" suffix from Python 3.11 on, so we map it to the equivalent
+ # numeric offset before parsing. This keeps dates that are already in RFC3339 format parsable on older versions.
+ normalized_date = date[:-1] + "+00:00" if date.endswith(("Z", "z")) else date
+ parsed_datetime = datetime.fromisoformat(normalized_date)
+ if parsed_datetime.utcoffset() is None:
+ converted_date = parsed_datetime.isoformat() + "Z"
+ else:
+ converted_date = parsed_datetime.isoformat()
+
+ return converted_date
diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py
index 61a2a3d1d..e4c110f28 100644
--- a/haystack/document_stores/weaviate.py
+++ b/haystack/document_stores/weaviate.py
@@ -2,6 +2,7 @@ import hashlib
import re
import uuid
from typing import Dict, Generator, List, Optional, Union
+from datetime import datetime
import logging
import json
@@ -11,6 +12,8 @@ from tqdm import tqdm
from haystack.schema import Document
from haystack.document_stores import BaseDocumentStore
from haystack.document_stores.base import get_batches_from_generator
+from haystack.document_stores.filter_utils import LogicalFilterClause
+from haystack.document_stores.utils import convert_date_to_rfc3339
try:
from weaviate import client, AuthClientPassword
@@ -225,8 +228,8 @@ class WeaviateDocumentStore(BaseDocumentStore):
content = json.loads(str(props.get(self.content_field)))
content_type = None
- if props.get("contenttype") is not None:
- content_type = str(props.pop("contenttype"))
+ if props.get("content_type") is not None:
+ content_type = str(props.pop("content_type"))
# Weaviate creates "_additional" key for semantic search
if "_additional" in props:
@@ -337,30 +340,61 @@ class WeaviateDocumentStore(BaseDocumentStore):
return cur_properties
- def _build_filter_clause(self, filters: Dict[str, List[str]]) -> dict:
+ def _get_date_properties(self, index: Optional[str] = None) -> List[str]:
"""
- Transform Haystack filter conditions to Weaviate where filter clauses.
+ Get all existing properties of type 'date' in the schema.
+
+ :param index: Name of the index (Weaviate class) whose properties are inspected. If None, the
+ DocumentStore's default index (self.index) will be used.
+ :return: Names of the properties whose first data type is "date". Empty if the schema contains no
+ class matching the index.
"""
- weaviate_filters = []
- weaviate_filter = {}
- for key, values in filters.items():
- for value in values:
- weaviate_filter = {"path": [key], "operator": "Equal", "valueString": value}
- weaviate_filters.append(weaviate_filter)
- if len(weaviate_filters) > 1:
- filter_dict = {"operator": "Or", "operands": weaviate_filters}
- return filter_dict
- else:
- return weaviate_filter
+ index = self._sanitize_index_name(index) or self.index
+ cur_properties = []
+ for class_item in self.weaviate_client.schema.get()["classes"]:
+ if class_item["class"] == index:
+ cur_properties = [item["name"] for item in class_item["properties"] if item["dataType"][0] == "date"]
- def _update_schema(self, new_prop: str, index: Optional[str] = None):
+ return cur_properties
+
+ def _update_schema(
+ self, new_prop: str, property_value: Union[List, str, int, float, bool], index: Optional[str] = None
+ ):
"""
Updates the schema with a new property.
+
+ :param new_prop: Name of the property to add.
+ :param property_value: Example value of the property; its Weaviate data type is inferred from it.
+ :param index: Name of the index to add the property to. If None, the DocumentStore's default index
+ (self.index) will be used.
"""
index = self._sanitize_index_name(index) or self.index
- property_dict = {"dataType": ["string"], "description": f"dynamic property {new_prop}", "name": new_prop}
+ data_type = self._get_weaviate_type_of_value(property_value)
+
+ property_dict = {"dataType": [data_type], "description": f"dynamic property {new_prop}", "name": new_prop}
self.weaviate_client.schema.property.create(index, property_dict)
+ @staticmethod
+ def _get_weaviate_type_of_value(value: Union[List, str, int, float, bool]) -> str:
+ """
+ Infers corresponding Weaviate data type for a value.
+
+ Strings that are parsable as date map to "date", other strings to "string", booleans to "boolean",
+ integers to "int" and floats to "number". For a list, the type is inferred from its first element and
+ suffixed with "[]". Values of any other type yield an empty type name.
+
+ :param value: The value (or list of values) to infer the Weaviate data type for.
+ :return: Name of the Weaviate data type.
+ """
+ data_type = ""
+ list_of_values = False
+ if isinstance(value, list):
+ list_of_values = True
+ # NOTE(review): an empty list raises IndexError here, as there is no element to infer the type from
+ value = value[0]
+
+ if isinstance(value, str):
+ # If the value is parsable by datetime, it is a date
+ try:
+ convert_date_to_rfc3339(value)
+ data_type = "date"
+ # Otherwise, the value is a string
+ except ValueError:
+ data_type = "string"
+ # bool has to be checked before int, as bool is a subclass of int and would be reported as "int" otherwise
+ elif isinstance(value, bool):
+ data_type = "boolean"
+ elif isinstance(value, int):
+ data_type = "int"
+ elif isinstance(value, float):
+ data_type = "number"
+
+ if list_of_values:
+ data_type += "[]"
+
+ return data_type
+
def _check_document(self, cur_props: List[str], doc: dict) -> List[str]:
"""
Find the properties in the document that don't exist in the existing schema.
@@ -458,9 +492,6 @@ class WeaviateDocumentStore(BaseDocumentStore):
if self.similarity == "cosine":
self.normalize_embedding(vector)
- # rename as weaviate doesn't like "_" in field names
- _doc["contenttype"] = _doc.pop("content_type")
-
# Converting content to JSON-string as Weaviate doesn't allow other nested list for tables
_doc["content"] = json.dumps(_doc["content"])
@@ -469,9 +500,14 @@ class WeaviateDocumentStore(BaseDocumentStore):
missing_props = self._check_document(current_properties, _doc)
if missing_props:
for property in missing_props:
- self._update_schema(property, index)
+ self._update_schema(property, _doc[property], index)
current_properties.append(property)
+ # Weaviate requires dates to be in RFC3339 format
+ date_fields = self._get_date_properties(index)
+ for date_field in date_fields:
+ _doc[date_field] = convert_date_to_rfc3339(_doc[date_field])
+
docs_batch.add(_doc, class_name=index, uuid=doc_id, vector=vector)
# Ingest a batch of documents
@@ -489,23 +525,43 @@ class WeaviateDocumentStore(BaseDocumentStore):
progress_bar.update(batch_size)
progress_bar.close()
- def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None):
+ def update_document_meta(self, id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None):
"""
Update the metadata dictionary of a document by specifying its string id.
+ Overwrites only the specified fields, the unspecified ones remain unchanged.
+
+ :param id: Id of the document whose metadata is updated.
+ :param meta: Metadata fields to set. Fields that are not yet part of the schema are added to it. Note that
+ date strings in this dictionary are converted to RFC3339 format in place.
+ :param index: Name of the index the document is stored in. If None, the DocumentStore's default index
+ (self.index) will be used.
"""
if not index:
index = self.index
+
+ current_properties = self._get_current_properties(index)
+
+ # Check if the new metadata contains additional properties and append them to the schema
+ missing_props = self._check_document(current_properties, meta)
+ if missing_props:
+ for property in missing_props:
+ self._update_schema(property, meta[property], index)
+ current_properties.append(property)
+
+ # Weaviate requires dates to be in RFC3339 format
+ date_fields = self._get_date_properties(index)
+ for date_field in date_fields:
+ # The update may cover only a subset of the schema's date properties, so absent fields are skipped
+ if isinstance(meta.get(date_field), str):
+ meta[date_field] = convert_date_to_rfc3339(str(meta[date_field]))
+
self.weaviate_client.data_object.update(meta, class_name=index, uuid=id)
- def get_embedding_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
+ def get_embedding_count(
+ self, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None
+ ) -> int:
"""
- Return the number of embeddings in the document store, which is the same as the number of documents since every document has a default embedding
+ Return the number of embeddings in the document store, which is the same as the number of documents since
+ every document has a default embedding.
+
+ :param filters: Optional filters, passed on unchanged to `get_document_count`.
+ :param index: Optional index name, passed on unchanged to `get_document_count`.
"""
return self.get_document_count(filters=filters, index=index)
def get_document_count(
self,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@@ -522,7 +578,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
index = self._sanitize_index_name(index) or self.index
doc_count = 0
if filters:
- filter_dict = self._build_filter_clause(filters=filters)
+ filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
result = (
self.weaviate_client.query.aggregate(index).with_fields("meta { count }").with_where(filter_dict).do()
)
@@ -538,7 +594,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -548,8 +604,32 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
- :param filters: Optional filters to narrow down the documents to return.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
"""
@@ -566,7 +646,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def _get_all_documents_in_index(
self,
index: Optional[str],
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
batch_size: int = 10_000,
only_documents_without_embedding: bool = False,
) -> Generator[dict, None, None]:
@@ -580,7 +660,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
properties.append("_additional {id, certainty, vector}")
if filters:
- filter_dict = self._build_filter_clause(filters=filters)
+ filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
result = (
self.weaviate_client.query.get(class_name=index, properties=properties).with_where(filter_dict).do()
)
@@ -597,7 +677,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@@ -609,8 +689,32 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
- :param filters: Optional filters to narrow down the documents to return.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
"""
@@ -630,7 +734,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def query(
self,
query: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@@ -640,7 +744,69 @@ class WeaviateDocumentStore(BaseDocumentStore):
that are most relevant to the query as defined by Weaviate semantic search.
:param query: The query
- :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
:param top_k: How many documents to return per query.
:param custom_query: Custom query that will executed using query.raw method, for more details refer
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
@@ -655,7 +821,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
if custom_query:
query_output = self.weaviate_client.query.raw(custom_query)
elif filters:
- filter_dict = self._build_filter_clause(filters)
+ filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
query_output = (
self.weaviate_client.query.get(class_name=index, properties=properties)
.with_where(filter_dict)
@@ -684,7 +850,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
- filters: Optional[dict] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@@ -694,8 +860,69 @@ class WeaviateDocumentStore(BaseDocumentStore):
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
:param query_emb: Embedding of the query (e.g. gathered from DPR)
- :param filters: Optional filters to narrow down the search space.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ # or simpler using default operators
+ filters = {
+ "type": "article",
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": ["economy", "politics"],
+ "publisher": "nytimes"
+ }
+ }
+ ```
+
+ To use the same logical operator multiple times on the same level, logical operators take
+ optionally a list of dictionaries as value.
+
+ __Example__:
+ ```python
+ filters = {
+ "$or": [
+ {
+ "$and": {
+ "Type": "News Paper",
+ "Date": {
+ "$lt": "2019-01-01"
+ }
+ }
+ },
+ {
+ "$and": {
+ "Type": "Blog Post",
+ "Date": {
+ "$gte": "2019-01-01"
+ }
+ }
+ }
+ ]
+ }
+ ```
:param top_k: How many documents to return
:param index: index name for storing the docs and metadata
:param return_embedding: To return document embedding
@@ -719,7 +946,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
query_string = {"vector": query_emb}
if filters:
- filter_dict = self._build_filter_clause(filters)
+ filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
query_output = (
self.weaviate_client.query.get(class_name=index, properties=properties)
.with_where(filter_dict)
@@ -751,7 +978,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
self,
retriever,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
):
@@ -763,8 +990,32 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Index name to update
:param update_existing_embeddings: Weaviate mandates an embedding while creating the document itself.
This option must be always true for weaviate and it will update the embeddings for all the documents.
- :param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
- Example: {"name": ["some", "more"], "category": ["only_one"]}
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
:return: None
"""
@@ -808,13 +1059,38 @@ class WeaviateDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
Delete documents in an index. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from.
- :param filters: Optional filters to narrow down the documents to be deleted.
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
:return: None
"""
if headers:
@@ -832,7 +1108,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@@ -841,11 +1117,35 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param ids: Optional list of IDs to narrow down the documents to be deleted.
- :param filters: Optional filters to narrow down the documents to be deleted.
- Example filters: {"name": ["some", "more"], "category": ["only_one"]}.
- If filters are provided along with a list of IDs, this method deletes the
- intersection of the two query results (documents that match the filters and
- have their ID in the list).
+ :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+ conditions.
+ Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+ operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+ `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as
+ value. Metadata field names take a dictionary of comparison operators as value. Comparison
+ operator keys take a single value or (in case of `"$in"`) a list of values as value.
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+ operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+ operation.
+
+ __Example__:
+ ```python
+ filters = {
+ "$and": {
+ "type": {"$eq": "article"},
+ "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+ "rating": {"$gte": 3},
+ "$or": {
+ "genre": {"$in": ["economy", "politics"]},
+ "publisher": {"$eq": "nytimes"}
+ }
+ }
+ }
+ ```
+ If filters are provided along with a list of IDs, this method deletes the
+ intersection of the two query results (documents that match the filters and
+ have their ID in the list).
:return: None
"""
if headers:
diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py
index 05ececaaa..f715f48de 100644
--- a/haystack/nodes/reader/farm.py
+++ b/haystack/nodes/reader/farm.py
@@ -873,7 +873,7 @@ class FARMReader(BaseReader):
)
# extract all questions for evaluation
- filters = {"origin": [label_origin]}
+ filters: Dict = {"origin": [label_origin]}
labels = document_store.get_all_labels(index=label_index, filters=filters)
diff --git a/haystack/nodes/retriever/base.py b/haystack/nodes/retriever/base.py
index 2a25fe4dd..1551a7a0c 100644
--- a/haystack/nodes/retriever/base.py
+++ b/haystack/nodes/retriever/base.py
@@ -124,7 +124,7 @@ class BaseRetriever(BaseComponent):
"""
# Extract all questions for evaluation
- filters = {"origin": [label_origin]}
+ filters: Dict = {"origin": [label_origin]}
timed_retrieve = self.timing(self.retrieve, "retrieve_time")
diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py
index 6d875012e..4be170fa5 100644
--- a/haystack/utils/deepsetcloud.py
+++ b/haystack/utils/deepsetcloud.py
@@ -1,6 +1,6 @@
import logging
import os
-from typing import Any, Dict, Generator, List, Optional
+from typing import Any, Dict, Generator, List, Optional, Union
try:
from typing import Literal
@@ -266,7 +266,7 @@ class IndexClient:
def query(
self,
query: Optional[str] = None,
- filters: Optional[Dict[str, List[str]]] = None,
+ filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
query_emb: Optional[List[float]] = None,
diff --git a/test/conftest.py b/test/conftest.py
index 314413b80..f2292876b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -323,7 +323,7 @@ def test_docs_xs():
"meta_field": "test2",
"name": "filename2",
"date_field": "2019-10-01",
- "numeric_field": 5,
+ "numeric_field": 5.0,
},
# Document object for a doc
Document(
@@ -332,11 +332,11 @@ def test_docs_xs():
),
Document(
content="My name is Camila and I live in Madrid",
- meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3},
+ meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3.0},
),
Document(
content="My name is Matteo and I live in Rome",
- meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0},
+ meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0.0},
),
]
@@ -530,16 +530,6 @@ def document_store_with_docs(request, test_docs_xs, tmp_path):
document_store = get_document_store(
document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path
)
- # TODO: remove the following part once we allow numbers as metadatfield value in WeaviateDocumentStore
- if request.param == "weaviate":
- for doc in test_docs_xs:
- if isinstance(doc, Document):
- doc.meta["numeric_field"] = str(doc.meta["numeric_field"])
- else:
- if "meta" in doc:
- doc["meta"]["numeric_field"] = str(doc["meta"]["numeric_field"])
- else:
- doc["numeric_field"] = str(doc["numeric_field"])
document_store.write_documents(test_docs_xs)
yield document_store
document_store.delete_documents()
diff --git a/test/test_document_store.py b/test/test_document_store.py
index d76b23f19..1d817d745 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -216,7 +216,7 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs)
assert len(documents) == 0
-@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
+@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "weaviate"], indirect=True)
def test_extended_filter(document_store_with_docs):
# Test comparison operators individually
documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}})
@@ -235,16 +235,16 @@ def test_extended_filter(document_store_with_docs):
documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$nin": ["test1", "test2", "n.a."]}})
assert len(documents) == 3
- documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3}})
+ documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
assert len(documents) == 3
- documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3}})
+ documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
assert len(documents) == 4
- documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3}})
+ documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
assert len(documents) == 1
- documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3}})
+ documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
assert len(documents) == 2
# Test compound filters
@@ -265,29 +265,34 @@ def test_extended_filter(document_store_with_docs):
"name": ["filename5", "filename3"],
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
- assert documents == documents_simplified_filter
+ # Order of returned documents might differ
+ assert len(documents) == len(documents_simplified_filter) and all(
+ doc in documents_simplified_filter for doc in documents
+ )
filters = {
"$and": {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
- "$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5}},
+ "$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5.0}},
}
}
documents = document_store_with_docs.get_all_documents(filters=filters)
assert len(documents) == 2
filters_simplified = {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
- "$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5}},
+ "$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5.0}},
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
- assert documents == documents_simplified_filter
+ assert len(documents) == len(documents_simplified_filter) and all(
+ doc in documents_simplified_filter for doc in documents
+ )
filters = {
"$and": {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": {"$in": ["filename5", "filename3"]},
- "$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": {"$eq": "test2"}}},
+ "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test2"}}},
},
}
}
@@ -297,11 +302,28 @@ def test_extended_filter(document_store_with_docs):
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": ["filename5", "filename3"],
- "$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": "test2"}},
+ "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test2"}},
},
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
- assert documents == documents_simplified_filter
+ assert len(documents) == len(documents_simplified_filter) and all(
+ doc in documents_simplified_filter for doc in documents
+ )
+
+ # Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentStore
+ filters = {
+ "$not": {
+ "$or": {
+ "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test3"}},
+ "$not": {"date_field": {"$lt": "2020-01-01"}},
+ }
+ }
+ }
+ documents = document_store_with_docs.get_all_documents(filters=filters)
+ docs_meta = [doc.meta["meta_field"] for doc in documents]
+ assert len(documents) == 2
+ assert "test3" in docs_meta
+ assert "test5" in docs_meta
# Test same logical operator twice on same level
filters = {