Support more data types and extended filters in WeaviateDocStore (#2143)

* Support more data types and extended filters in WeaviateDocStore

* Adapt types to extended filters

* Update Documentation & Code Style

* Fix mypy

* Fix type of filters

* Update Documentation & Code Style

* Add Docstrings for BaseDocStore

* Update Documentation & Code Style

* Add + prettify DocStrings

* Update Documentation & Code Style

* Fix types

* Update Documentation & Code Style

* Remove import of TypedDict

* Fix tests

* Update Documentation & Code Style

* Fix circular import

* Fix inversion of not operation + add test case

* Fix mypy

* Update Documentation & Code Style

* Apply black

* Use convert_date_to_rfc3339 instead of datetime.fromisoformat

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
bogdankostic 2022-02-18 08:55:17 +01:00 committed by GitHub
parent 1c61c1edaa
commit 2a674eaff7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 1699 additions and 318 deletions

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,7 @@
"openapi": "3.0.2",
"info": {
"title": "Haystack REST API",
"version": "1.1.0"
"version": "1.2.0rc0"
},
"paths": {
"/initialized": {

View File

@ -98,7 +98,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -108,8 +108,33 @@ class BaseDocumentStore(BaseComponent):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: Number of documents that are passed to bulk function at a time.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@ -120,7 +145,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -132,8 +157,33 @@ class BaseDocumentStore(BaseComponent):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with a large number of documents, batching can help reduce memory footprint.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@ -158,7 +208,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_labels(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
) -> List[Label]:
pass
@ -166,7 +216,7 @@ class BaseDocumentStore(BaseComponent):
def get_all_labels_aggregated(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
open_domain: bool = True,
drop_negative_labels: bool = False,
drop_no_answers: bool = False,
@ -188,8 +238,33 @@ class BaseDocumentStore(BaseComponent):
:param index: Name of the index to get the labels from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the labels to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param open_domain: When True, labels are aggregated purely based on the question text alone.
When False, labels are aggregated in a closed domain fashion based on the question text
and also the id of the document that the label is tied to. In this setting, this function
@ -260,7 +335,7 @@ class BaseDocumentStore(BaseComponent):
@abstractmethod
def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -299,7 +374,7 @@ class BaseDocumentStore(BaseComponent):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[Optional[Dict[str, List[str]]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -413,7 +488,7 @@ class BaseDocumentStore(BaseComponent):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
pass
@ -423,7 +498,7 @@ class BaseDocumentStore(BaseComponent):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
pass
@ -433,7 +508,7 @@ class BaseDocumentStore(BaseComponent):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
pass
@ -564,7 +639,7 @@ class KeywordDocumentStore(BaseDocumentStore):
def query(
self,
query: Optional[str],
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@ -575,7 +650,70 @@ class KeywordDocumentStore(BaseDocumentStore):
that are most relevant to the query as defined by keyword matching algorithms like BM25.
:param query: The query
:param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators can
optionally take a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return per query.
:param custom_query: Custom query to be executed.
:param index: The name of the index in the DocumentStore from which to retrieve documents

View File

@ -77,7 +77,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -87,8 +87,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: Number of documents that are passed to bulk function at a time.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@ -106,7 +130,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -118,8 +142,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with a large number of documents, batching can help reduce memory footprint.
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
@ -168,7 +216,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -184,7 +232,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[Optional[Dict[str, List[str]]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -194,8 +242,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
:param query_emb: Embedding of the query (e.g. gathered from DPR)
:param filters: Optional filters to narrow down the search space.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators can
optionally take a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return
:param index: Index name for storing the docs and metadata
:param return_embedding: To return document embedding
@ -220,7 +329,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def query(
self,
query: Optional[str],
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@ -231,7 +340,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
that are most relevant to the query as defined by the BM25 algorithm.
:param query: The query
:param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators can
optionally take a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return per query.
:param custom_query: Custom query to be executed.
:param index: The name of the index in the DocumentStore from which to retrieve documents
@ -280,7 +451,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def get_all_labels(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
) -> List[Label]:
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.")
@ -299,7 +470,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.")
@ -308,7 +479,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.")
@ -317,7 +488,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.")

View File

@ -478,7 +478,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
key: str,
query: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
) -> List[dict]:
@ -499,7 +499,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -705,7 +705,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_document_count(
self,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -736,7 +736,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_embedding_count(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
) -> int:
"""
@ -756,7 +756,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -777,7 +777,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -805,7 +805,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -828,7 +828,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -862,7 +862,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def get_all_labels(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
batch_size: int = 10_000,
) -> List[Label]:
@ -879,7 +879,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def _get_all_documents_in_index(
self,
index: str,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
batch_size: int = 10_000,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -901,7 +901,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def query(
self,
query: Optional[str],
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@ -924,7 +924,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -949,10 +949,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
Example:
__Example__:
```python
filters = {
"$or": [
@ -1105,7 +1105,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -1127,7 +1127,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -1152,10 +1152,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
Example:
__Example__:
```python
filters = {
"$or": [
@ -1348,7 +1348,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
retriever,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -1374,7 +1374,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -1449,7 +1449,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@ -1467,7 +1467,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -1497,7 +1497,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@ -1517,7 +1517,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -1532,9 +1532,9 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
}
```
If filters are provided along with a list of IDs, this method deletes the
intersection of the two query results (documents that match the filters and
have their ID in the list).
If filters are provided along with a list of IDs, this method deletes the
intersection of the two query results (documents that match the filters and
have their ID in the list).
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
:return: None
@ -1560,7 +1560,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@ -1580,7 +1580,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -1695,7 +1695,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[Dict[str, Any]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -1717,7 +1717,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
Example:
__Example__:
```python
filters = {
"$and": {
@ -1742,10 +1742,10 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
Example:
__Example__:
```python
filters = {
"$or": [

View File

@ -1,4 +1,4 @@
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from haystack.nodes.retriever import BaseRetriever
@ -308,7 +308,7 @@ class FAISSDocumentStore(SQLDocumentStore):
retriever: "BaseRetriever",
index: Optional[str] = None,
update_existing_embeddings: bool = True,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
batch_size: int = 10_000,
):
"""
@ -379,7 +379,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -396,7 +396,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -447,7 +447,7 @@ class FAISSDocumentStore(SQLDocumentStore):
doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
return documents
def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int:
def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int:
"""
Return the count of embeddings in the document store.
"""
@ -486,7 +486,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -507,7 +507,7 @@ class FAISSDocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -546,7 +546,7 @@ class FAISSDocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,

View File

@ -1,9 +1,11 @@
from typing import Union, List, Dict
from typing import Union, List, Dict, Optional, Tuple
from abc import ABC, abstractmethod
from collections import defaultdict
from haystack.document_stores.utils import convert_date_to_rfc3339
def nested_defaultdict():
def nested_defaultdict() -> defaultdict:
"""
Data structure that recursively adds a dictionary as value if a key does not exist. Advantage: In nested dictionary
structures, we don't need to check if a key already exists (which can become hard to maintain in nested dictionaries
@ -81,17 +83,17 @@ class LogicalFilterClause(ABC):
"""
def __init__(self, conditions: List["LogicalFilterClause"]):
def __init__(self, conditions: List[Union["LogicalFilterClause", "ComparisonOperation"]]):
self.conditions = conditions
@classmethod
def parse(cls, filter_term: Union[dict, List[dict]]):
def parse(cls, filter_term: Union[dict, List[dict]]) -> Union["LogicalFilterClause", "ComparisonOperation"]:
"""
Parses a filter dictionary/list and returns a LogicalFilterClause instance.
:param filter_term: Dictionary or list that contains the filter definition.
"""
conditions = []
conditions: List[Union[LogicalFilterClause, ComparisonOperation]] = []
if isinstance(filter_term, dict):
filter_term = [filter_term]
@ -122,7 +124,14 @@ class LogicalFilterClause(ABC):
"""
pass
def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict]:
@abstractmethod
def convert_to_weaviate(self):
"""
Converts the LogicalFilterClause instance to a Weaviate filter.
"""
pass
def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict[str, Dict]]:
"""
Merges Elasticsearch range queries that perform on the same metadata field.
"""
@ -142,14 +151,23 @@ class LogicalFilterClause(ABC):
return conditions
@abstractmethod
def invert(self) -> Union["LogicalFilterClause", "ComparisonOperation"]:
"""
Inverts the LogicalOperation instance.
Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore.
(https://github.com/semi-technologies/weaviate/issues/1717)
"""
pass
class ComparisonOperation(ABC):
def __init__(self, field_name: str, comparison_value: Union[str, float, List]):
def __init__(self, field_name: str, comparison_value: Union[str, int, float, bool, List]):
self.field_name = field_name
self.comparison_value = comparison_value
@classmethod
def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]):
def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]) -> List["ComparisonOperation"]:
comparison_operations: List[ComparisonOperation] = []
if isinstance(comparison_clause, dict):
@ -187,107 +205,273 @@ class ComparisonOperation(ABC):
"""
pass
@abstractmethod
def convert_to_weaviate(self):
"""
Converts the ComparisonOperation instance to a Weaviate comparison operator.
"""
pass
@abstractmethod
def invert(self) -> "ComparisonOperation":
"""
Inverts the ComparisonOperation.
Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore.
(https://github.com/semi-technologies/weaviate/issues/1717)
"""
pass
def _get_weaviate_datatype(
self, value: Optional[Union[str, int, float, bool]] = None
) -> Tuple[str, Union[str, int, float, bool]]:
"""
Determines the type of the comparison value and converts it to RFC3339 format if it is as date,
as Weaviate requires dates to be in RFC3339 format including the time and timezone.
"""
if value is None:
assert not isinstance(self.comparison_value, list) # Necessary for mypy
value = self.comparison_value
if isinstance(value, str):
# Check if comparison value is a date
try:
value = convert_date_to_rfc3339(value)
data_type = "valueDate"
# Comparison value is a plain string
except ValueError:
data_type = "valueString"
elif isinstance(value, int):
data_type = "valueInt"
elif isinstance(value, float):
data_type = "valueNumber"
elif isinstance(value, bool):
data_type = "valueBoolean"
else:
raise ValueError(
f"Unsupported data type of comparison value for {self.__class__.__name__}."
f"Value needs to be of type str, int, float, or bool."
)
return data_type, value
class NotOperation(LogicalFilterClause):
"""
Handles conversion of logical 'NOT' operations.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict]:
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
conditions = self._merge_es_range_queries(conditions)
return {"bool": {"must_not": conditions}}
def convert_to_weaviate(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]:
    """
    Converts the 'NOT' operation to a Weaviate filter by inverting every contained condition.

    A single inverted condition is returned as is; several inverted conditions are wrapped in an 'Or' filter.
    """
    inverted_operands = []
    for clause in self.conditions:
        inverted_operands.append(clause.invert().convert_to_weaviate())

    if len(inverted_operands) <= 1:
        return inverted_operands[0]

    # The conditions are combined with AND by default; negating that conjunction yields an OR (De Morgan).
    return {"operator": "Or", "operands": inverted_operands}
def invert(self) -> Union[LogicalFilterClause, ComparisonOperation]:
    """
    Inverts the 'NOT' operation, i.e. returns its conditions without any negation.
    """
    # Inverting a "$not" means it is nested inside another "$not"; the two negations cancel out,
    # so the contained conditions are handed back untouched.
    if len(self.conditions) <= 1:
        return self.conditions[0]
    # Several conditions are joined by AND, the default logical operation for combining conditions.
    return AndOperation(self.conditions)
class AndOperation(LogicalFilterClause):
"""
Handles conversion of logical 'AND' operations.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict]:
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
conditions = self._merge_es_range_queries(conditions)
return {"bool": {"must": conditions}}
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
conditions = [condition.convert_to_weaviate() for condition in self.conditions]
return {"operator": "And", "operands": conditions}
def invert(self) -> "OrOperation":
return OrOperation([condition.invert() for condition in self.conditions])
class OrOperation(LogicalFilterClause):
"""
Handles conversion of logical 'OR' operations.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict]:
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
conditions = self._merge_es_range_queries(conditions)
return {"bool": {"should": conditions}}
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
conditions = [condition.convert_to_weaviate() for condition in self.conditions]
return {"operator": "Or", "operands": conditions}
def invert(self) -> AndOperation:
return AndOperation([condition.invert() for condition in self.conditions])
class EqOperation(ComparisonOperation):
"""
Handles conversion of the '$eq' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Union[str, int, float, bool]]]:
assert not isinstance(self.comparison_value, list), "Use '$in' operation for lists as comparison values."
return {"term": {self.field_name: self.comparison_value}}
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
return {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value}
def invert(self) -> "NeOperation":
return NeOperation(self.field_name, self.comparison_value)
class InOperation(ComparisonOperation):
"""
Handles conversion of the '$in' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, List]]:
assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
return {"terms": {self.field_name: self.comparison_value}}
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
    """
    Converts the '$in' operation to a Weaviate 'Or' filter holding one 'Equal' operand per list element.
    """
    assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
    operands: List[Dict] = []
    for candidate in self.comparison_value:
        value_key, converted_value = self._get_weaviate_datatype(candidate)
        operands.append({"path": [self.field_name], "operator": "Equal", value_key: converted_value})
    return {"operator": "Or", "operands": operands}
def invert(self) -> "NinOperation":
return NinOperation(self.field_name, self.comparison_value)
class NeOperation(ComparisonOperation):
"""
Handles conversion of the '$ne' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, Union[str, int, float, bool]]]]]:
assert not isinstance(self.comparison_value, list), "Use '$nin' operation for lists as comparison values."
return {"bool": {"must_not": {"term": {self.field_name: self.comparison_value}}}}
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
return {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value}
def invert(self) -> "EqOperation":
return EqOperation(self.field_name, self.comparison_value)
class NinOperation(ComparisonOperation):
"""
Handles conversion of the '$nin' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, List]]]]:
assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
return {"bool": {"must_not": {"terms": {self.field_name: self.comparison_value}}}}
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
    """
    Converts the '$nin' operation to a Weaviate 'And' filter holding one 'NotEqual' operand per list element.
    """
    assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
    operands: List[Dict] = []
    for candidate in self.comparison_value:
        value_key, converted_value = self._get_weaviate_datatype(candidate)
        operands.append({"path": [self.field_name], "operator": "NotEqual", value_key: converted_value})
    return {"operator": "And", "operands": operands}
def invert(self) -> "InOperation":
return InOperation(self.field_name, self.comparison_value)
class GtOperation(ComparisonOperation):
"""
Handles conversion of the '$gt' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list."
return {"range": {self.field_name: {"gt": self.comparison_value}}}
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list."
return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value}
def invert(self) -> "LteOperation":
return LteOperation(self.field_name, self.comparison_value)
class GteOperation(ComparisonOperation):
"""
Handles conversion of the '$gte' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list."
return {"range": {self.field_name: {"gte": self.comparison_value}}}
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list."
return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value}
def invert(self) -> "LtOperation":
return LtOperation(self.field_name, self.comparison_value)
class LtOperation(ComparisonOperation):
"""
Handles conversion of the '$lt' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list."
return {"range": {self.field_name: {"lt": self.comparison_value}}}
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$lt' operation must not be a list."
return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value}
def invert(self) -> "GteOperation":
return GteOperation(self.field_name, self.comparison_value)
class LteOperation(ComparisonOperation):
"""
Handles conversion of the '$lte' comparison operation.
"""
def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list."
return {"range": {self.field_name: {"lte": self.comparison_value}}}
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list."
return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value}
def invert(self) -> "GtOperation":
return GtOperation(self.field_name, self.comparison_value)

View File

@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator
from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator, Any
if TYPE_CHECKING:
from haystack.nodes.retriever import BaseRetriever
@ -291,7 +291,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -337,7 +337,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
retriever: "BaseRetriever",
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
):
@ -390,7 +390,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -427,7 +427,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def _query(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
only_documents_without_embedding: bool = False,
):
@ -464,7 +464,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -490,7 +490,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -514,7 +514,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_labels(
self,
index: str = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
) -> List[Label]:
"""
@ -544,7 +544,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -569,7 +569,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -603,7 +603,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""

View File

@ -307,7 +307,7 @@ class MilvusDocumentStore(SQLDocumentStore):
index: Optional[str] = None,
batch_size: int = 10_000,
update_existing_embeddings: bool = True,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
):
"""
Updates the embeddings in the document store using the encoding model specified in the retriever.
@ -374,7 +374,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[dict] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -440,7 +440,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -465,7 +465,7 @@ class MilvusDocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -508,7 +508,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -541,7 +541,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -676,7 +676,11 @@ class MilvusDocumentStore(SQLDocumentStore):
return vectors
def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int:
def get_embedding_count(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
) -> int:
"""
Return the count of embeddings in the document store.
"""

View File

@ -375,7 +375,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
index: Optional[str] = None,
batch_size: int = 10_000,
update_existing_embeddings: bool = True,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
):
"""
Updates the embeddings in the document store using the encoding model specified in the retriever.
@ -457,7 +457,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[dict] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -538,7 +538,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -571,7 +571,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -604,7 +604,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,

View File

@ -215,7 +215,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -233,7 +233,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -271,7 +271,7 @@ class SQLDocumentStore(BaseDocumentStore):
def _query(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
vector_ids: Optional[List[str]] = None,
only_documents_without_embedding: bool = False,
batch_size: int = 10_000,
@ -521,7 +521,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -609,7 +609,7 @@ class SQLDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -634,7 +634,7 @@ class SQLDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -674,7 +674,7 @@ class SQLDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""

View File

@ -2,6 +2,7 @@ from typing import Dict, List, Optional, Tuple, Union, Generator
import json
import logging
from datetime import datetime
from haystack.schema import Document, Label, Answer, Span
from haystack.nodes.preprocessor import PreProcessor
@ -250,3 +251,23 @@ def _extract_docs_and_labels_from_dict(
labels.append(label)
return docs, labels, problematic_ids
def convert_date_to_rfc3339(date: str) -> str:
    """
    Converts a date to RFC3339 format, as Weaviate requires dates to be in RFC3339 format including the time and
    timezone.

    If the provided date string does not contain a time and/or timezone, we use 00:00 as default time
    and UTC as default time zone. Dates that already carry the RFC3339 UTC designator "Z" (such as the output
    of this function) are accepted as well and returned with an explicit "+00:00" offset.

    This method cannot be part of WeaviateDocumentStore, as this would result in a circular import between weaviate.py
    and filter_utils.py.

    :param date: Date string in ISO format, optionally with a trailing "Z".
    :return: The date as RFC3339 string including time and timezone.
    :raises ValueError: If the string cannot be parsed as an ISO date.
    """
    # `datetime.fromisoformat` only understands the "Z" suffix from Python 3.11 on. Normalizing it to an explicit
    # UTC offset keeps already converted dates parseable on older versions instead of raising a ValueError.
    if isinstance(date, str) and date.endswith("Z"):
        date = date[:-1] + "+00:00"

    parsed_datetime = datetime.fromisoformat(date)
    if parsed_datetime.utcoffset() is None:
        # Naive datetime: no timezone was given, so UTC is assumed.
        converted_date = parsed_datetime.isoformat() + "Z"
    else:
        converted_date = parsed_datetime.isoformat()

    return converted_date

View File

@ -2,6 +2,7 @@ import hashlib
import re
import uuid
from typing import Dict, Generator, List, Optional, Union
from datetime import datetime
import logging
import json
@ -11,6 +12,8 @@ from tqdm import tqdm
from haystack.schema import Document
from haystack.document_stores import BaseDocumentStore
from haystack.document_stores.base import get_batches_from_generator
from haystack.document_stores.filter_utils import LogicalFilterClause
from haystack.document_stores.utils import convert_date_to_rfc3339
try:
from weaviate import client, AuthClientPassword
@ -225,8 +228,8 @@ class WeaviateDocumentStore(BaseDocumentStore):
content = json.loads(str(props.get(self.content_field)))
content_type = None
if props.get("contenttype") is not None:
content_type = str(props.pop("contenttype"))
if props.get("content_type") is not None:
content_type = str(props.pop("content_type"))
# Weaviate creates "_additional" key for semantic search
if "_additional" in props:
@ -337,30 +340,61 @@ class WeaviateDocumentStore(BaseDocumentStore):
return cur_properties
def _build_filter_clause(self, filters: Dict[str, List[str]]) -> dict:
def _get_date_properties(self, index: Optional[str] = None) -> List[str]:
"""
Transform Haystack filter conditions to Weaviate where filter clauses.
Get all existing properties of type 'date' in the schema.
"""
weaviate_filters = []
weaviate_filter = {}
for key, values in filters.items():
for value in values:
weaviate_filter = {"path": [key], "operator": "Equal", "valueString": value}
weaviate_filters.append(weaviate_filter)
if len(weaviate_filters) > 1:
filter_dict = {"operator": "Or", "operands": weaviate_filters}
return filter_dict
else:
return weaviate_filter
index = self._sanitize_index_name(index) or self.index
cur_properties = []
for class_item in self.weaviate_client.schema.get()["classes"]:
if class_item["class"] == index:
cur_properties = [item["name"] for item in class_item["properties"] if item["dataType"][0] == "date"]
def _update_schema(self, new_prop: str, index: Optional[str] = None):
return cur_properties
def _update_schema(
self, new_prop: str, property_value: Union[List, str, int, float, bool], index: Optional[str] = None
):
"""
Updates the schema with a new property.
"""
index = self._sanitize_index_name(index) or self.index
property_dict = {"dataType": ["string"], "description": f"dynamic property {new_prop}", "name": new_prop}
data_type = self._get_weaviate_type_of_value(property_value)
property_dict = {"dataType": [data_type], "description": f"dynamic property {new_prop}", "name": new_prop}
self.weaviate_client.schema.property.create(index, property_dict)
@staticmethod
def _get_weaviate_type_of_value(value: Union[List, str, int, float, bool]) -> str:
"""
Infers corresponding Weaviate data type for a value.
"""
data_type = ""
list_of_values = False
if isinstance(value, list):
list_of_values = True
value = value[0]
if isinstance(value, str):
# If the value is parsable by datetime, it is a date
try:
convert_date_to_rfc3339(value)
data_type = "date"
# Otherwise, the value is a string
except ValueError:
data_type = "string"
elif isinstance(value, int):
data_type = "int"
elif isinstance(value, float):
data_type = "number"
elif isinstance(value, bool):
data_type = "boolean"
if list_of_values:
data_type += "[]"
return data_type
def _check_document(self, cur_props: List[str], doc: dict) -> List[str]:
"""
Find the properties in the document that don't exist in the existing schema.
@ -458,9 +492,6 @@ class WeaviateDocumentStore(BaseDocumentStore):
if self.similarity == "cosine":
self.normalize_embedding(vector)
# rename as weaviate doesn't like "_" in field names
_doc["contenttype"] = _doc.pop("content_type")
# Converting content to JSON-string as Weaviate doesn't allow other nested list for tables
_doc["content"] = json.dumps(_doc["content"])
@ -469,9 +500,14 @@ class WeaviateDocumentStore(BaseDocumentStore):
missing_props = self._check_document(current_properties, _doc)
if missing_props:
for property in missing_props:
self._update_schema(property, index)
self._update_schema(property, _doc[property], index)
current_properties.append(property)
# Weaviate requires dates to be in RFC3339 format
date_fields = self._get_date_properties(index)
for date_field in date_fields:
_doc[date_field] = convert_date_to_rfc3339(_doc[date_field])
docs_batch.add(_doc, class_name=index, uuid=doc_id, vector=vector)
# Ingest a batch of documents
@ -489,23 +525,43 @@ class WeaviateDocumentStore(BaseDocumentStore):
progress_bar.update(batch_size)
progress_bar.close()
def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None):
def update_document_meta(self, id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None):
    """
    Update the metadata dictionary of a document by specifying its string id.
    Overwrites only the specified fields, the unspecified ones remain unchanged.

    :param id: ID of the document whose metadata is updated.
    :param meta: Metadata fields to overwrite. Properties that are not yet part of the schema are added to
                 it. Note that string values of date properties are replaced in place by their
                 RFC3339 representation.
    :param index: Name of the index containing the document. If not given, `self.index` is used.
    """
    if not index:
        index = self.index

    current_properties = self._get_current_properties(index)

    # Check if the new metadata contains additional properties and append them to the schema
    missing_props = self._check_document(current_properties, meta)
    for prop in missing_props:
        self._update_schema(prop, meta[prop], index)
        current_properties.append(prop)

    # Weaviate requires dates to be in RFC3339 format.
    # The schema may contain date properties that are not part of this (partial) update,
    # so only the date fields actually present in `meta` are converted.
    for date_field in self._get_date_properties(index):
        date_value = meta.get(date_field)
        if isinstance(date_value, str):
            meta[date_field] = convert_date_to_rfc3339(date_value)

    self.weaviate_client.data_object.update(meta, class_name=index, uuid=id)
def get_embedding_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
def get_embedding_count(
    self, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None
) -> int:
    """
    Return the number of embeddings in the document store, which is the same as the number of documents since
    every document has a default embedding.

    :param filters: Optional filters, forwarded unchanged to `get_document_count`.
    :param index: Optional index name, forwarded unchanged to `get_document_count`.
    """
    # Embeddings and documents are counted alike, so the document counter does the work.
    embedding_count = self.get_document_count(filters=filters, index=index)
    return embedding_count
def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -522,7 +578,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
index = self._sanitize_index_name(index) or self.index
doc_count = 0
if filters:
filter_dict = self._build_filter_clause(filters=filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
result = (
self.weaviate_client.query.aggregate(index).with_fields("meta { count }").with_where(filter_dict).do()
)
@ -538,7 +594,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -548,8 +604,32 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
"""
@ -566,7 +646,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def _get_all_documents_in_index(
self,
index: Optional[str],
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
batch_size: int = 10_000,
only_documents_without_embedding: bool = False,
) -> Generator[dict, None, None]:
@ -580,7 +660,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
properties.append("_additional {id, certainty, vector}")
if filters:
filter_dict = self._build_filter_clause(filters=filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
result = (
self.weaviate_client.query.get(class_name=index, properties=properties).with_where(filter_dict).do()
)
@ -597,7 +677,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -609,8 +689,32 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
"""
@ -630,7 +734,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def query(
self,
query: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@ -640,7 +744,69 @@ class WeaviateDocumentStore(BaseDocumentStore):
that are most relevant to the query as defined by Weaviate semantic search.
:param query: The query
:param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return per query.
:param custom_query: Custom query that will be executed using the query.raw method, for more details refer to
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
@ -655,7 +821,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
if custom_query:
query_output = self.weaviate_client.query.raw(custom_query)
elif filters:
filter_dict = self._build_filter_clause(filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
query_output = (
self.weaviate_client.query.get(class_name=index, properties=properties)
.with_where(filter_dict)
@ -684,7 +850,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[dict] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -694,8 +860,69 @@ class WeaviateDocumentStore(BaseDocumentStore):
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
:param query_emb: Embedding of the query (e.g. gathered from DPR)
:param filters: Optional filters to narrow down the search space.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return
:param index: index name for storing the docs and metadata
:param return_embedding: To return document embedding
@ -719,7 +946,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
query_string = {"vector": query_emb}
if filters:
filter_dict = self._build_filter_clause(filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
query_output = (
self.weaviate_client.query.get(class_name=index, properties=properties)
.with_where(filter_dict)
@ -751,7 +978,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
self,
retriever,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
):
@ -763,8 +990,32 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Index name to update
:param update_existing_embeddings: Weaviate mandates an embedding while creating the document itself.
This option must be always true for weaviate and it will update the embeddings for all the documents.
:param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
:return: None
"""
@ -808,13 +1059,38 @@ class WeaviateDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
Delete documents in an index. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from.
:param filters: Optional filters to narrow down the documents to be deleted.
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:return: None
"""
if headers:
@ -832,7 +1108,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@ -841,11 +1117,35 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param ids: Optional list of IDs to narrow down the documents to be deleted.
:param filters: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}.
If filters are provided along with a list of IDs, this method deletes the
intersection of the two query results (documents that match the filters and
have their ID in the list).
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
If filters are provided along with a list of IDs, this method deletes the
intersection of the two query results (documents that match the filters and
have their ID in the list).
:return: None
"""
if headers:

View File

@ -873,7 +873,7 @@ class FARMReader(BaseReader):
)
# extract all questions for evaluation
filters = {"origin": [label_origin]}
filters: Dict = {"origin": [label_origin]}
labels = document_store.get_all_labels(index=label_index, filters=filters)

View File

@ -124,7 +124,7 @@ class BaseRetriever(BaseComponent):
"""
# Extract all questions for evaluation
filters = {"origin": [label_origin]}
filters: Dict = {"origin": [label_origin]}
timed_retrieve = self.timing(self.retrieve, "retrieve_time")

View File

@ -1,6 +1,6 @@
import logging
import os
from typing import Any, Dict, Generator, List, Optional
from typing import Any, Dict, Generator, List, Optional, Union
try:
from typing import Literal
@ -266,7 +266,7 @@ class IndexClient:
def query(
self,
query: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
query_emb: Optional[List[float]] = None,

View File

@ -323,7 +323,7 @@ def test_docs_xs():
"meta_field": "test2",
"name": "filename2",
"date_field": "2019-10-01",
"numeric_field": 5,
"numeric_field": 5.0,
},
# Document object for a doc
Document(
@ -332,11 +332,11 @@ def test_docs_xs():
),
Document(
content="My name is Camila and I live in Madrid",
meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3},
meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3.0},
),
Document(
content="My name is Matteo and I live in Rome",
meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0},
meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0.0},
),
]
@ -530,16 +530,6 @@ def document_store_with_docs(request, test_docs_xs, tmp_path):
document_store = get_document_store(
document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path
)
# TODO: remove the following part once we allow numbers as metadatfield value in WeaviateDocumentStore
if request.param == "weaviate":
for doc in test_docs_xs:
if isinstance(doc, Document):
doc.meta["numeric_field"] = str(doc.meta["numeric_field"])
else:
if "meta" in doc:
doc["meta"]["numeric_field"] = str(doc["meta"]["numeric_field"])
else:
doc["numeric_field"] = str(doc["numeric_field"])
document_store.write_documents(test_docs_xs)
yield document_store
document_store.delete_documents()

View File

@ -216,7 +216,7 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs)
assert len(documents) == 0
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "weaviate"], indirect=True)
def test_extended_filter(document_store_with_docs):
# Test comparison operators individually
documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}})
@ -235,16 +235,16 @@ def test_extended_filter(document_store_with_docs):
documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$nin": ["test1", "test2", "n.a."]}})
assert len(documents) == 3
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
assert len(documents) == 3
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
assert len(documents) == 4
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
assert len(documents) == 1
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
assert len(documents) == 2
# Test compound filters
@ -265,29 +265,34 @@ def test_extended_filter(document_store_with_docs):
"name": ["filename5", "filename3"],
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
assert documents == documents_simplified_filter
# Order of returned documents might differ
assert len(documents) == len(documents_simplified_filter) and all(
doc in documents_simplified_filter for doc in documents
)
filters = {
"$and": {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5}},
"$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5.0}},
}
}
documents = document_store_with_docs.get_all_documents(filters=filters)
assert len(documents) == 2
filters_simplified = {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5}},
"$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5.0}},
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
assert documents == documents_simplified_filter
assert len(documents) == len(documents_simplified_filter) and all(
doc in documents_simplified_filter for doc in documents
)
filters = {
"$and": {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": {"$in": ["filename5", "filename3"]},
"$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": {"$eq": "test2"}}},
"$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test2"}}},
},
}
}
@ -297,11 +302,28 @@ def test_extended_filter(document_store_with_docs):
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": ["filename5", "filename3"],
"$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": "test2"}},
"$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test2"}},
},
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
assert documents == documents_simplified_filter
assert len(documents) == len(documents_simplified_filter) and all(
doc in documents_simplified_filter for doc in documents
)
# Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore
filters = {
"$not": {
"$or": {
"$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test3"}},
"$not": {"date_field": {"$lt": "2020-01-01"}},
}
}
}
documents = document_store_with_docs.get_all_documents(filters=filters)
docs_meta = [doc.meta["meta_field"] for doc in documents]
assert len(documents) == 2
assert "test3" in docs_meta
assert "test5" in docs_meta
# Test same logical operator twice on same level
filters = {