Mirror of https://github.com/deepset-ai/haystack.git, synced 2026-01-04 19:17:26 +00:00
Support more data types and extended filters in WeaviateDocStore (#2143)
* Support more data types and extended filters in WeaviateDocStore
* Adapt types to extended filters
* Update Documentation & Code Style
* Fix mypy
* Fix type of filters
* Update Documentation & Code Style
* Add Docstrings for BaseDocStore
* Update Documentation & Code Style
* Add + prettify DocStrings
* Update Documentation & Code Style
* Fix types
* Update Documentation & Code Style
* Remove import of TypedDict
* Fix tests
* Update Documentation & Code Style
* Fix circular import
* Fix inversion of not operation + add test case
* Fix mypy
* Update Documentation & Code Style
* Apply black
* Use convert_date_to_rfc3339 instead of datetime.fromisoformat

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent 1c61c1edaa
commit 2a674eaff7
File diff suppressed because it is too large
@ -2,7 +2,7 @@
"openapi": "3.0.2",
"info": {
"title": "Haystack REST API",
"version": "1.1.0"
"version": "1.2.0rc0"
},
"paths": {
"/initialized": {
@ -98,7 +98,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
def get_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -108,8 +108,33 @@ class BaseDocumentStore(BaseComponent):
|
||||
|
||||
:param index: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
:param return_embedding: Whether to return the document embeddings.
|
||||
:param batch_size: Number of documents that are passed to bulk function at a time.
|
||||
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
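To make the new filter syntax concrete, here is a minimal usage sketch. It is not part of this diff: the store type, index name, and connection details are assumptions for illustration only.

```python
from haystack.document_stores import ElasticsearchDocumentStore

# Illustrative only: assumes a locally running Elasticsearch instance and the
# default "document" index; any DocumentStore supporting extended filters works.
document_store = ElasticsearchDocumentStore(host="localhost", index="document")

filters = {
    "type": "article",                                    # implicit "$eq"
    "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},  # range comparison
    "rating": {"$gte": 3},
    "$or": {"genre": ["economy", "politics"], "publisher": "nytimes"},
}
docs = document_store.get_all_documents(filters=filters)
```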
|
||||
@ -120,7 +145,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
def get_all_documents_generator(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -132,8 +157,33 @@ class BaseDocumentStore(BaseComponent):
|
||||
|
||||
:param index: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
:param return_embedding: Whether to return the document embeddings.
|
||||
:param batch_size: When working with a large number of documents, batching can help reduce the memory footprint.
|
||||
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
|
||||
@ -158,7 +208,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
def get_all_labels(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> List[Label]:
|
||||
pass
|
||||
@ -166,7 +216,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
def get_all_labels_aggregated(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
open_domain: bool = True,
|
||||
drop_negative_labels: bool = False,
|
||||
drop_no_answers: bool = False,
|
||||
@ -188,8 +238,33 @@ class BaseDocumentStore(BaseComponent):
|
||||
|
||||
:param index: Name of the index to get the labels from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the labels to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
:param open_domain: When True, labels are aggregated purely based on the question text alone.
|
||||
When False, labels are aggregated in a closed domain fashion based on the question text
|
||||
and also the id of the document that the label is tied to. In this setting, this function
|
||||
@ -260,7 +335,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
@abstractmethod
|
||||
def get_document_count(
|
||||
self,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
index: Optional[str] = None,
|
||||
only_documents_without_embedding: bool = False,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -299,7 +374,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
def query_by_embedding(
|
||||
self,
|
||||
query_emb: np.ndarray,
|
||||
filters: Optional[Optional[Dict[str, List[str]]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
@ -413,7 +488,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
def delete_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
pass
|
||||
@ -423,7 +498,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
pass
|
||||
@ -433,7 +508,7 @@ class BaseDocumentStore(BaseComponent):
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
pass
|
||||
@ -564,7 +639,7 @@ class KeywordDocumentStore(BaseDocumentStore):
|
||||
def query(
|
||||
self,
|
||||
query: Optional[str],
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
top_k: int = 10,
|
||||
custom_query: Optional[str] = None,
|
||||
index: Optional[str] = None,
|
||||
@ -575,7 +650,70 @@ class KeywordDocumentStore(BaseDocumentStore):
|
||||
that are most relevant to the query as defined by keyword matching algorithms like BM25.
|
||||
|
||||
:param query: The query
|
||||
:param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
# or simpler using default operators
|
||||
filters = {
|
||||
"type": "article",
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": ["economy", "politics"],
|
||||
"publisher": "nytimes"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
{
|
||||
"$and": {
|
||||
"Type": "News Paper",
|
||||
"Date": {
|
||||
"$lt": "2019-01-01"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"$and": {
|
||||
"Type": "Blog Post",
|
||||
"Date": {
|
||||
"$gte": "2019-01-01"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
:param top_k: How many documents to return per query.
|
||||
:param custom_query: Custom query to be executed.
|
||||
:param index: The name of the index in the DocumentStore from which to retrieve documents
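As a hedged illustration of how such filters might be combined with a keyword `query()` call (the query text and store setup below are assumptions, not taken from this diff):

```python
from haystack.document_stores import ElasticsearchDocumentStore

# Illustrative setup; any KeywordDocumentStore subclass exposes query().
document_store = ElasticsearchDocumentStore(host="localhost", index="document")

docs = document_store.query(
    query="central bank interest rates",
    filters={
        "$or": [
            {"$and": {"Type": "News Paper", "Date": {"$lt": "2019-01-01"}}},
            {"$and": {"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}}},
        ]
    },
    top_k=10,
)
```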
|
||||
|
||||
@ -77,7 +77,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
def get_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -87,8 +87,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
|
||||
:param index: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
:param return_embedding: Whether to return the document embeddings.
|
||||
:param batch_size: Number of documents that are passed to bulk function at a time.
|
||||
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
|
||||
@ -106,7 +130,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
def get_all_documents_generator(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -118,8 +142,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
|
||||
:param index: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
:param filters: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
:param return_embedding: Whether to return the document embeddings.
|
||||
:param batch_size: When working with a large number of documents, batching can help reduce the memory footprint.
|
||||
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
|
||||
@ -168,7 +216,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
|
||||
def get_document_count(
|
||||
self,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
index: Optional[str] = None,
|
||||
only_documents_without_embedding: bool = False,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -184,7 +232,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
def query_by_embedding(
|
||||
self,
|
||||
query_emb: np.ndarray,
|
||||
filters: Optional[Optional[Dict[str, List[str]]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
@ -194,8 +242,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
|
||||
|
||||
:param query_emb: Embedding of the query (e.g. gathered from DPR)
|
||||
:param filters: Optional filters to narrow down the search space.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
# or simpler using default operators
|
||||
filters = {
|
||||
"type": "article",
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": ["economy", "politics"],
|
||||
"publisher": "nytimes"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
{
|
||||
"$and": {
|
||||
"Type": "News Paper",
|
||||
"Date": {
|
||||
"$lt": "2019-01-01"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"$and": {
|
||||
"Type": "Blog Post",
|
||||
"Date": {
|
||||
"$gte": "2019-01-01"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
:param top_k: How many documents to return
|
||||
:param index: Index name for storing the docs and metadata
|
||||
:param return_embedding: To return document embedding
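A short sketch of a `query_by_embedding()` call using the same filter syntax, reusing the `document_store` from the earlier sketches. The embedding below is a random placeholder; in practice it comes from a retriever and must match the store's embedding dimension.

```python
import numpy as np

# Placeholder for a real query embedding, e.g. retriever.embed_queries(["..."])[0];
# 768 dimensions is an assumption, not a requirement of this diff.
query_emb = np.random.rand(768).astype(np.float32)

docs = document_store.query_by_embedding(
    query_emb=query_emb,
    filters={"type": "article", "rating": {"$gte": 3}},
    top_k=10,
)
```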
|
||||
@ -220,7 +329,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
def query(
|
||||
self,
|
||||
query: Optional[str],
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
top_k: int = 10,
|
||||
custom_query: Optional[str] = None,
|
||||
index: Optional[str] = None,
|
||||
@ -231,7 +340,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
that are most relevant to the query as defined by the BM25 algorithm.
|
||||
|
||||
:param query: The query
|
||||
:param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
# or simpler using default operators
|
||||
filters = {
|
||||
"type": "article",
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": ["economy", "politics"],
|
||||
"publisher": "nytimes"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
{
|
||||
"$and": {
|
||||
"Type": "News Paper",
|
||||
"Date": {
|
||||
"$lt": "2019-01-01"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"$and": {
|
||||
"Type": "Blog Post",
|
||||
"Date": {
|
||||
"$gte": "2019-01-01"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
:param top_k: How many documents to return per query.
|
||||
:param custom_query: Custom query to be executed.
|
||||
:param index: The name of the index in the DocumentStore from which to retrieve documents
|
||||
@ -280,7 +451,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
def get_all_labels(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> List[Label]:
|
||||
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.")
|
||||
@ -299,7 +470,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
def delete_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.")
|
||||
@ -308,7 +479,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.")
|
||||
@ -317,7 +488,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.")
|
||||
|
||||
@ -478,7 +478,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
self,
|
||||
key: str,
|
||||
query: Optional[str] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
index: Optional[str] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> List[dict]:
|
||||
@ -499,7 +499,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -705,7 +705,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
|
||||
def get_document_count(
|
||||
self,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
index: Optional[str] = None,
|
||||
only_documents_without_embedding: bool = False,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -736,7 +736,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def get_embedding_count(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
) -> int:
|
||||
"""
|
||||
@ -756,7 +756,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def get_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -777,7 +777,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -805,7 +805,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def get_all_documents_generator(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -828,7 +828,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -862,7 +862,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def get_all_labels(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
batch_size: int = 10_000,
|
||||
) -> List[Label]:
|
||||
@ -879,7 +879,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def _get_all_documents_in_index(
|
||||
self,
|
||||
index: str,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
batch_size: int = 10_000,
|
||||
only_documents_without_embedding: bool = False,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -901,7 +901,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def query(
|
||||
self,
|
||||
query: Optional[str],
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
top_k: int = 10,
|
||||
custom_query: Optional[str] = None,
|
||||
index: Optional[str] = None,
|
||||
@ -924,7 +924,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -949,10 +949,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
@ -1105,7 +1105,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def query_by_embedding(
|
||||
self,
|
||||
query_emb: np.ndarray,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
@ -1127,7 +1127,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -1152,10 +1152,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
@ -1348,7 +1348,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
self,
|
||||
retriever,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
update_existing_embeddings: bool = True,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -1374,7 +1374,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -1449,7 +1449,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
def delete_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
@ -1467,7 +1467,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -1497,7 +1497,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
@ -1517,7 +1517,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -1532,9 +1532,9 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
}
|
||||
```
|
||||
|
||||
If filters are provided along with a list of IDs, this method deletes the
|
||||
intersection of the two query results (documents that match the filters and
|
||||
have their ID in the list).
|
||||
If filters are provided along with a list of IDs, this method deletes the
|
||||
intersection of the two query results (documents that match the filters and
|
||||
have their ID in the list).
|
||||
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
|
||||
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
|
||||
:return: None
|
||||
@ -1560,7 +1560,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
@ -1580,7 +1580,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -1695,7 +1695,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
|
||||
def query_by_embedding(
|
||||
self,
|
||||
query_emb: np.ndarray,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
@ -1717,7 +1717,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
@ -1742,10 +1742,10 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
|
||||
Example:
|
||||
__Example__:
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from haystack.nodes.retriever import BaseRetriever
|
||||
@ -308,7 +308,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
retriever: "BaseRetriever",
|
||||
index: Optional[str] = None,
|
||||
update_existing_embeddings: bool = True,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
|
||||
batch_size: int = 10_000,
|
||||
):
|
||||
"""
|
||||
@ -379,7 +379,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
def get_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -396,7 +396,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
def get_all_documents_generator(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
|
||||
return_embedding: Optional[bool] = None,
|
||||
batch_size: int = 10_000,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
@ -447,7 +447,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"]))
|
||||
return documents
|
||||
|
||||
def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int:
|
||||
def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int:
|
||||
"""
|
||||
Return the count of embeddings in the document store.
|
||||
"""
|
||||
@ -486,7 +486,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
def delete_all_documents(
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
@ -507,7 +507,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
self,
|
||||
index: Optional[str] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
):
|
||||
"""
|
||||
@ -546,7 +546,7 @@ class FAISSDocumentStore(SQLDocumentStore):
|
||||
def query_by_embedding(
|
||||
self,
|
||||
query_emb: np.ndarray,
|
||||
filters: Optional[Dict[str, List[str]]] = None,
|
||||
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore
|
||||
top_k: int = 10,
|
||||
index: Optional[str] = None,
|
||||
return_embedding: Optional[bool] = None,
|
||||
|
||||
@ -1,9 +1,11 @@
from typing import Union, List, Dict
from typing import Union, List, Dict, Optional, Tuple
from abc import ABC, abstractmethod
from collections import defaultdict

from haystack.document_stores.utils import convert_date_to_rfc3339

def nested_defaultdict():

def nested_defaultdict() -> defaultdict:
    """
    Data structure that recursively adds a dictionary as value if a key does not exist. Advantage: In nested dictionary
    structures, we don't need to check if a key already exists (which can become hard to maintain in nested dictionaries
|
||||
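For readers unfamiliar with the pattern, this is roughly what a recursively nesting defaultdict looks like. The function body below is an assumption reconstructed from the docstring; only the signature change appears in the diff.

```python
from collections import defaultdict

def nested_defaultdict() -> defaultdict:
    # Every missing key yields another nested defaultdict, so deeply nested
    # assignments need no existence checks on intermediate keys.
    return defaultdict(nested_defaultdict)

query = nested_defaultdict()
query["bool"]["must"]["range"]["date"] = {"gte": "2015-01-01"}  # no KeyError
```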
@ -81,17 +83,17 @@ class LogicalFilterClause(ABC):
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, conditions: List["LogicalFilterClause"]):
|
||||
def __init__(self, conditions: List[Union["LogicalFilterClause", "ComparisonOperation"]]):
|
||||
self.conditions = conditions
|
||||
|
||||
@classmethod
|
||||
def parse(cls, filter_term: Union[dict, List[dict]]):
|
||||
def parse(cls, filter_term: Union[dict, List[dict]]) -> Union["LogicalFilterClause", "ComparisonOperation"]:
|
||||
"""
|
||||
Parses a filter dictionary/list and returns a LogicalFilterClause instance.
|
||||
|
||||
:param filter_term: Dictionary or list that contains the filter definition.
|
||||
"""
|
||||
conditions = []
|
||||
conditions: List[Union[LogicalFilterClause, ComparisonOperation]] = []
|
||||
|
||||
if isinstance(filter_term, dict):
|
||||
filter_term = [filter_term]
|
||||
@ -122,7 +124,14 @@ class LogicalFilterClause(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict]:
|
||||
@abstractmethod
|
||||
def convert_to_weaviate(self):
|
||||
"""
|
||||
Converts the LogicalFilterClause instance to a Weaviate filter.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict[str, Dict]]:
|
||||
"""
|
||||
Merges Elasticsearch range queries that perform on the same metadata field.
|
||||
"""
|
||||
@ -142,14 +151,23 @@ class LogicalFilterClause(ABC):
|
||||
|
||||
return conditions
|
||||
|
||||
@abstractmethod
|
||||
def invert(self) -> Union["LogicalFilterClause", "ComparisonOperation"]:
|
||||
"""
|
||||
Inverts the LogicalOperation instance.
|
||||
Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore.
|
||||
(https://github.com/semi-technologies/weaviate/issues/1717)
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class ComparisonOperation(ABC):
|
||||
def __init__(self, field_name: str, comparison_value: Union[str, float, List]):
|
||||
def __init__(self, field_name: str, comparison_value: Union[str, int, float, bool, List]):
|
||||
self.field_name = field_name
|
||||
self.comparison_value = comparison_value
|
||||
|
||||
@classmethod
|
||||
def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]):
|
||||
def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]) -> List["ComparisonOperation"]:
|
||||
comparison_operations: List[ComparisonOperation] = []
|
||||
|
||||
if isinstance(comparison_clause, dict):
|
||||
@ -187,107 +205,273 @@ class ComparisonOperation(ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def convert_to_weaviate(self):
|
||||
"""
|
||||
Converts the ComparisonOperation instance to a Weaviate comparison operator.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def invert(self) -> "ComparisonOperation":
|
||||
"""
|
||||
Inverts the ComparisonOperation.
|
||||
Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore.
|
||||
(https://github.com/semi-technologies/weaviate/issues/1717)
|
||||
"""
|
||||
pass
|
||||
|
||||
def _get_weaviate_datatype(
|
||||
self, value: Optional[Union[str, int, float, bool]] = None
|
||||
) -> Tuple[str, Union[str, int, float, bool]]:
|
||||
"""
|
||||
Determines the type of the comparison value and converts it to RFC3339 format if it is a date,
|
||||
as Weaviate requires dates to be in RFC3339 format including the time and timezone.
|
||||
|
||||
"""
|
||||
if value is None:
|
||||
assert not isinstance(self.comparison_value, list) # Necessary for mypy
|
||||
value = self.comparison_value
|
||||
|
||||
if isinstance(value, str):
|
||||
# Check if comparison value is a date
|
||||
try:
|
||||
value = convert_date_to_rfc3339(value)
|
||||
data_type = "valueDate"
|
||||
# Comparison value is a plain string
|
||||
except ValueError:
|
||||
data_type = "valueString"
|
||||
elif isinstance(value, int):
|
||||
data_type = "valueInt"
|
||||
elif isinstance(value, float):
|
||||
data_type = "valueNumber"
|
||||
elif isinstance(value, bool):
|
||||
data_type = "valueBoolean"
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported data type of comparison value for {self.__class__.__name__}."
|
||||
f"Value needs to be of type str, int, float, or bool."
|
||||
)
|
||||
|
||||
return data_type, value
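A small sketch of the datatype detection above. The printed value is an assumption based on the RFC3339 requirement; only the import path is taken from this diff.

```python
from haystack.document_stores.utils import convert_date_to_rfc3339

# Strings that parse as dates become "valueDate" after RFC3339 conversion;
# other strings fall back to "valueString", while ints/floats/bools map to
# "valueInt" / "valueNumber" / "valueBoolean".
try:
    print(convert_date_to_rfc3339("2021-01-01"))  # e.g. "2021-01-01T00:00:00Z"
except ValueError:
    print("not a date -> would be treated as valueString")
```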
|
||||
|
||||
|
||||
class NotOperation(LogicalFilterClause):
|
||||
"""
|
||||
Handles conversion of logical 'NOT' operations.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict]:
|
||||
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
|
||||
conditions = self._merge_es_range_queries(conditions)
|
||||
return {"bool": {"must_not": conditions}}
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]:
|
||||
conditions = [condition.invert().convert_to_weaviate() for condition in self.conditions]
|
||||
if len(conditions) > 1:
|
||||
# Conditions in self.conditions are by default combined with AND which becomes OR according to DeMorgan
|
||||
return {"operator": "Or", "operands": conditions}
|
||||
else:
|
||||
return conditions[0]
|
||||
|
||||
def invert(self) -> Union[LogicalFilterClause, ComparisonOperation]:
|
||||
# This method is called when a "$not" operation is embedded in another "$not" operation. Therefore, we don't
|
||||
# invert the operations here, as two "$not" operation annihilate each other.
|
||||
# (If we have more than one condition, we return an AndOperation, the default logical operation for combining
|
||||
# multiple conditions.)
|
||||
if len(self.conditions) > 1:
|
||||
return AndOperation(self.conditions)
|
||||
else:
|
||||
return self.conditions[0]
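To make the De Morgan-style inversion concrete, a hedged end-to-end sketch; the module path and the exact output shape are assumptions based on this diff, not verified output.

```python
from haystack.document_stores.filter_utils import LogicalFilterClause

# Weaviate has no usable 'Not' operator, so each condition inside "$not" is
# inverted and the inverted conditions are combined with "Or".
not_filter = {"$not": {"type": "article", "rating": {"$gte": 3}}}
weaviate_filter = LogicalFilterClause.parse(not_filter).convert_to_weaviate()
print(weaviate_filter)
# Expected shape (illustrative):
# {"operator": "Or", "operands": [
#     {"path": ["type"], "operator": "NotEqual", "valueString": "article"},
#     {"path": ["rating"], "operator": "LessThan", "valueInt": 3}]}
```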
|
||||
|
||||
|
||||
class AndOperation(LogicalFilterClause):
|
||||
"""
|
||||
Handles conversion of logical 'AND' operations.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict]:
|
||||
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
|
||||
conditions = self._merge_es_range_queries(conditions)
|
||||
return {"bool": {"must": conditions}}
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
|
||||
conditions = [condition.convert_to_weaviate() for condition in self.conditions]
|
||||
return {"operator": "And", "operands": conditions}
|
||||
|
||||
def invert(self) -> "OrOperation":
|
||||
return OrOperation([condition.invert() for condition in self.conditions])
|
||||
|
||||
|
||||
class OrOperation(LogicalFilterClause):
|
||||
"""
|
||||
Handles conversion of logical 'OR' operations.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict]:
|
||||
conditions = [condition.convert_to_elasticsearch() for condition in self.conditions]
|
||||
conditions = self._merge_es_range_queries(conditions)
|
||||
return {"bool": {"should": conditions}}
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
|
||||
conditions = [condition.convert_to_weaviate() for condition in self.conditions]
|
||||
return {"operator": "Or", "operands": conditions}
|
||||
|
||||
def invert(self) -> AndOperation:
|
||||
return AndOperation([condition.invert() for condition in self.conditions])
|
||||
|
||||
|
||||
class EqOperation(ComparisonOperation):
|
||||
"""
|
||||
Handles conversion of the '$eq' comparison operation.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Union[str, int, float, bool]]]:
|
||||
assert not isinstance(self.comparison_value, list), "Use '$in' operation for lists as comparison values."
|
||||
return {"term": {self.field_name: self.comparison_value}}
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype()
|
||||
return {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value}
|
||||
|
||||
def invert(self) -> "NeOperation":
|
||||
return NeOperation(self.field_name, self.comparison_value)
|
||||
|
||||
|
||||
class InOperation(ComparisonOperation):
|
||||
"""
|
||||
Handles conversion of the '$in' comparison operation.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, List]]:
|
||||
assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
|
||||
return {"terms": {self.field_name: self.comparison_value}}
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
|
||||
filter_dict: Dict[str, Union[str, List[Dict]]] = {"operator": "Or", "operands": []}
|
||||
assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list."
|
||||
for value in self.comparison_value:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype(value)
|
||||
assert isinstance(filter_dict["operands"], list) # Necessary for mypy
|
||||
filter_dict["operands"].append(
|
||||
{"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value}
|
||||
)
|
||||
|
||||
return filter_dict
|
||||
|
||||
def invert(self) -> "NinOperation":
|
||||
return NinOperation(self.field_name, self.comparison_value)
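A hedged sketch of what the '$in' conversions above produce for both backends (module path assumed as in the previous sketch; the printed shapes are expectations, not captured output):

```python
from haystack.document_stores.filter_utils import ComparisonOperation

# Parse a single "$in" comparison on the "genre" metadata field.
op = ComparisonOperation.parse("genre", {"$in": ["economy", "politics"]})[0]

print(op.convert_to_elasticsearch())
# {"terms": {"genre": ["economy", "politics"]}}

print(op.convert_to_weaviate())
# {"operator": "Or", "operands": [
#     {"path": ["genre"], "operator": "Equal", "valueString": "economy"},
#     {"path": ["genre"], "operator": "Equal", "valueString": "politics"}]}
```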
|
||||
|
||||
|
||||
class NeOperation(ComparisonOperation):
|
||||
"""
|
||||
Handles conversion of the '$ne' comparison operation.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, Union[str, int, float, bool]]]]]:
|
||||
assert not isinstance(self.comparison_value, list), "Use '$nin' operation for lists as comparison values."
|
||||
return {"bool": {"must_not": {"term": {self.field_name: self.comparison_value}}}}
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype()
|
||||
return {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value}
|
||||
|
||||
def invert(self) -> "EqOperation":
|
||||
return EqOperation(self.field_name, self.comparison_value)
|
||||
|
||||
|
||||
class NinOperation(ComparisonOperation):
|
||||
"""
|
||||
Handles conversion of the '$nin' comparison operation.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, List]]]]:
|
||||
assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
|
||||
return {"bool": {"must_not": {"terms": {self.field_name: self.comparison_value}}}}
|
||||
|
||||
def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]:
|
||||
filter_dict: Dict[str, Union[str, List[Dict]]] = {"operator": "And", "operands": []}
|
||||
assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list."
|
||||
for value in self.comparison_value:
|
||||
comp_value_type, comp_value = self._get_weaviate_datatype(value)
|
||||
assert isinstance(filter_dict["operands"], list) # Necessary for mypy
|
||||
filter_dict["operands"].append(
|
||||
{"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value}
|
||||
)
|
||||
|
||||
return filter_dict
|
||||
|
||||
def invert(self) -> "InOperation":
|
||||
return InOperation(self.field_name, self.comparison_value)
|
||||
|
||||
|
||||
class GtOperation(ComparisonOperation):
|
||||
"""
|
||||
Handles conversion of the '$gt' comparison operation.
|
||||
"""
|
||||
|
||||
def convert_to_elasticsearch(self):
|
||||
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list."
return {"range": {self.field_name: {"gt": self.comparison_value}}}

def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list."
return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value}

def invert(self) -> "LteOperation":
return LteOperation(self.field_name, self.comparison_value)


class GteOperation(ComparisonOperation):
"""
Handles conversion of the '$gte' comparison operation.
"""

def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list."
return {"range": {self.field_name: {"gte": self.comparison_value}}}

def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list."
return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value}

def invert(self) -> "LtOperation":
return LtOperation(self.field_name, self.comparison_value)


class LtOperation(ComparisonOperation):
"""
Handles conversion of the '$lt' comparison operation.
"""

def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list."
return {"range": {self.field_name: {"lt": self.comparison_value}}}

def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$lt' operation must not be a list."
return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value}

def invert(self) -> "GteOperation":
return GteOperation(self.field_name, self.comparison_value)


class LteOperation(ComparisonOperation):
"""
Handles conversion of the '$lte' comparison operation.
"""

def convert_to_elasticsearch(self):
def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]:
assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list."
return {"range": {self.field_name: {"lte": self.comparison_value}}}

def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]:
comp_value_type, comp_value = self._get_weaviate_datatype()
assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list."
return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value}

def invert(self) -> "GtOperation":
return GtOperation(self.field_name, self.comparison_value)
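For readers skimming the diff, a rough sketch of how these leaf operations behave. This is illustration only, not part of the change: the `GtOperation(field_name, comparison_value)` constructor is inferred from the `invert()` calls above, and the `valueInt` key is assumed from Weaviate's where-filter syntax, since `_get_weaviate_datatype()` itself is not shown in this hunk.

```python
op = GtOperation("rating", 3)

op.convert_to_elasticsearch()
# -> {"range": {"rating": {"gt": 3}}}

op.convert_to_weaviate()
# -> {"path": ["rating"], "operator": "GreaterThan", "valueInt": 3}

# invert() is used when a surrounding "$not" is pushed down to the leaves:
# NOT (rating > 3) becomes rating <= 3.
op.invert()
# -> LteOperation("rating", 3)
```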
@ -1,4 +1,4 @@
from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator
from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator, Any

if TYPE_CHECKING:
from haystack.nodes.retriever import BaseRetriever
@ -291,7 +291,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -337,7 +337,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
retriever: "BaseRetriever",
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
):
@ -390,7 +390,7 @@ class InMemoryDocumentStore(BaseDocumentStore):

def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -427,7 +427,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def _query(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
only_documents_without_embedding: bool = False,
):
@ -464,7 +464,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -490,7 +490,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -514,7 +514,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def get_all_labels(
self,
index: str = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
) -> List[Label]:
"""
@ -544,7 +544,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -569,7 +569,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -603,7 +603,7 @@ class InMemoryDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore
headers: Optional[Dict[str, str]] = None,
):
"""

@ -307,7 +307,7 @@ class MilvusDocumentStore(SQLDocumentStore):
index: Optional[str] = None,
batch_size: int = 10_000,
update_existing_embeddings: bool = True,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
):
"""
Updates the embeddings in the document store using the encoding model specified in the retriever.
@ -374,7 +374,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[dict] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -440,7 +440,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -465,7 +465,7 @@ class MilvusDocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -508,7 +508,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -541,7 +541,7 @@ class MilvusDocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -676,7 +676,11 @@ class MilvusDocumentStore(SQLDocumentStore):

return vectors

def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int:
def get_embedding_count(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore
) -> int:
"""
Return the count of embeddings in the document store.
"""

@ -375,7 +375,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
index: Optional[str] = None,
batch_size: int = 10_000,
update_existing_embeddings: bool = True,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
):
"""
Updates the embeddings in the document store using the encoding model specified in the retriever.
@ -457,7 +457,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[dict] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -538,7 +538,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -571,7 +571,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -604,7 +604,7 @@ class Milvus2DocumentStore(SQLDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,

@ -215,7 +215,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -233,7 +233,7 @@ class SQLDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -271,7 +271,7 @@ class SQLDocumentStore(BaseDocumentStore):
def _query(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
vector_ids: Optional[List[str]] = None,
only_documents_without_embedding: bool = False,
batch_size: int = 10_000,
@ -521,7 +521,7 @@ class SQLDocumentStore(BaseDocumentStore):

def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -609,7 +609,7 @@ class SQLDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -634,7 +634,7 @@ class SQLDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""
@ -674,7 +674,7 @@ class SQLDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore
headers: Optional[Dict[str, str]] = None,
):
"""

@ -2,6 +2,7 @@ from typing import Dict, List, Optional, Tuple, Union, Generator

import json
import logging
from datetime import datetime

from haystack.schema import Document, Label, Answer, Span
from haystack.nodes.preprocessor import PreProcessor
@ -250,3 +251,23 @@ def _extract_docs_and_labels_from_dict(
labels.append(label)

return docs, labels, problematic_ids


def convert_date_to_rfc3339(date: str) -> str:
"""
Converts a date to RFC3339 format, as Weaviate requires dates to be in RFC3339 format including the time and
timezone.

If the provided date string does not contain a time and/or timezone, we use 00:00 as default time
and UTC as default time zone.

This method cannot be part of WeaviateDocumentStore, as this would result in a circular import between weaviate.py
and filter_utils.py.
"""
parsed_datetime = datetime.fromisoformat(date)
if parsed_datetime.utcoffset() is None:
converted_date = parsed_datetime.isoformat() + "Z"
else:
converted_date = parsed_datetime.isoformat()

return converted_date
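A quick sanity check of this helper (illustrative, not part of the diff), following the logic above:

```python
convert_date_to_rfc3339("2021-01-01")
# -> "2021-01-01T00:00:00Z"        (no time/timezone given: midnight UTC is assumed)

convert_date_to_rfc3339("2021-01-01T12:30:00+02:00")
# -> "2021-01-01T12:30:00+02:00"   (an explicit offset is kept as-is)
```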
@ -2,6 +2,7 @@ import hashlib
import re
import uuid
from typing import Dict, Generator, List, Optional, Union
from datetime import datetime

import logging
import json
@ -11,6 +12,8 @@ from tqdm import tqdm
from haystack.schema import Document
from haystack.document_stores import BaseDocumentStore
from haystack.document_stores.base import get_batches_from_generator
from haystack.document_stores.filter_utils import LogicalFilterClause
from haystack.document_stores.utils import convert_date_to_rfc3339

try:
from weaviate import client, AuthClientPassword
@ -225,8 +228,8 @@ class WeaviateDocumentStore(BaseDocumentStore):
content = json.loads(str(props.get(self.content_field)))

content_type = None
if props.get("contenttype") is not None:
content_type = str(props.pop("contenttype"))
if props.get("content_type") is not None:
content_type = str(props.pop("content_type"))

# Weaviate creates "_additional" key for semantic search
if "_additional" in props:
@ -337,30 +340,61 @@ class WeaviateDocumentStore(BaseDocumentStore):

return cur_properties

def _build_filter_clause(self, filters: Dict[str, List[str]]) -> dict:
def _get_date_properties(self, index: Optional[str] = None) -> List[str]:
"""
Transform Haystack filter conditions to Weaviate where filter clauses.
Get all existing properties of type 'date' in the schema.
"""
weaviate_filters = []
weaviate_filter = {}
for key, values in filters.items():
for value in values:
weaviate_filter = {"path": [key], "operator": "Equal", "valueString": value}
weaviate_filters.append(weaviate_filter)
if len(weaviate_filters) > 1:
filter_dict = {"operator": "Or", "operands": weaviate_filters}
return filter_dict
else:
return weaviate_filter
index = self._sanitize_index_name(index) or self.index
cur_properties = []
for class_item in self.weaviate_client.schema.get()["classes"]:
if class_item["class"] == index:
cur_properties = [item["name"] for item in class_item["properties"] if item["dataType"][0] == "date"]

def _update_schema(self, new_prop: str, index: Optional[str] = None):
return cur_properties

def _update_schema(
self, new_prop: str, property_value: Union[List, str, int, float, bool], index: Optional[str] = None
):
"""
Updates the schema with a new property.
"""
index = self._sanitize_index_name(index) or self.index
property_dict = {"dataType": ["string"], "description": f"dynamic property {new_prop}", "name": new_prop}
data_type = self._get_weaviate_type_of_value(property_value)

property_dict = {"dataType": [data_type], "description": f"dynamic property {new_prop}", "name": new_prop}
self.weaviate_client.schema.property.create(index, property_dict)

@staticmethod
def _get_weaviate_type_of_value(value: Union[List, str, int, float, bool]) -> str:
"""
Infers corresponding Weaviate data type for a value.
"""
data_type = ""
list_of_values = False
if isinstance(value, list):
list_of_values = True
value = value[0]

if isinstance(value, str):
# If the value is parsable by datetime, it is a date
try:
convert_date_to_rfc3339(value)
data_type = "date"
# Otherwise, the value is a string
except ValueError:
data_type = "string"
elif isinstance(value, int):
data_type = "int"
elif isinstance(value, float):
data_type = "number"
elif isinstance(value, bool):
data_type = "boolean"

if list_of_values:
data_type += "[]"

return data_type
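Illustration only (not part of the diff): the data types this helper infers for typical metadata values, and the property dictionary that `_update_schema` then registers for a dynamically added field.

```python
WeaviateDocumentStore._get_weaviate_type_of_value(5)             # -> "int"
WeaviateDocumentStore._get_weaviate_type_of_value(3.0)           # -> "number"
WeaviateDocumentStore._get_weaviate_type_of_value("nytimes")     # -> "string"
WeaviateDocumentStore._get_weaviate_type_of_value("2021-02-01")  # -> "date" (parsable as an ISO date)
WeaviateDocumentStore._get_weaviate_type_of_value(["a", "b"])    # -> "string[]" (type of the first element, plus "[]")

# _update_schema would then register e.g. a new int-valued "rating" property as:
# {"dataType": ["int"], "description": "dynamic property rating", "name": "rating"}
```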
def _check_document(self, cur_props: List[str], doc: dict) -> List[str]:
"""
Find the properties in the document that don't exist in the existing schema.
@ -458,9 +492,6 @@ class WeaviateDocumentStore(BaseDocumentStore):
if self.similarity == "cosine":
self.normalize_embedding(vector)

# rename as weaviate doesn't like "_" in field names
_doc["contenttype"] = _doc.pop("content_type")

# Converting content to JSON-string as Weaviate doesn't allow other nested list for tables
_doc["content"] = json.dumps(_doc["content"])

@ -469,9 +500,14 @@ class WeaviateDocumentStore(BaseDocumentStore):
missing_props = self._check_document(current_properties, _doc)
if missing_props:
for property in missing_props:
self._update_schema(property, index)
self._update_schema(property, _doc[property], index)
current_properties.append(property)

# Weaviate requires dates to be in RFC3339 format
date_fields = self._get_date_properties(index)
for date_field in date_fields:
_doc[date_field] = convert_date_to_rfc3339(_doc[date_field])

docs_batch.add(_doc, class_name=index, uuid=doc_id, vector=vector)

# Ingest a batch of documents
@ -489,23 +525,43 @@ class WeaviateDocumentStore(BaseDocumentStore):
progress_bar.update(batch_size)
progress_bar.close()

def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None):
def update_document_meta(self, id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None):
"""
Update the metadata dictionary of a document by specifying its string id.
Overwrites only the specified fields, the unspecified ones remain unchanged.
"""
if not index:
index = self.index

current_properties = self._get_current_properties(index)

# Check if the new metadata contains additional properties and append them to the schema
missing_props = self._check_document(current_properties, meta)
if missing_props:
for property in missing_props:
self._update_schema(property, meta[property], index)
current_properties.append(property)

# Weaviate requires dates to be in RFC3339 format
date_fields = self._get_date_properties(index)
for date_field in date_fields:
if isinstance(meta[date_field], str):
meta[date_field] = convert_date_to_rfc3339(str(meta[date_field]))

self.weaviate_client.data_object.update(meta, class_name=index, uuid=id)
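With the extended value types, a metadata update can now carry numbers, booleans and dates; new fields are added to the schema on the fly and date strings are converted to RFC3339 before the Weaviate update call. A minimal usage sketch (illustrative; the document id is a placeholder and a running Weaviate instance with default settings is assumed):

```python
document_store = WeaviateDocumentStore()
document_store.update_document_meta(
    id="some-document-uuid",  # hypothetical id of an existing document
    meta={"rating": 4, "published": True, "date_field": "2021-02-01"},
)
```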
def get_embedding_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
def get_embedding_count(
self, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None
) -> int:
"""
Return the number of embeddings in the document store, which is the same as the number of documents since every document has a default embedding
Return the number of embeddings in the document store, which is the same as the number of documents since
every document has a default embedding.
"""
return self.get_document_count(filters=filters, index=index)

def get_document_count(
self,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
index: Optional[str] = None,
only_documents_without_embedding: bool = False,
headers: Optional[Dict[str, str]] = None,
@ -522,7 +578,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
index = self._sanitize_index_name(index) or self.index
doc_count = 0
if filters:
filter_dict = self._build_filter_clause(filters=filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
result = (
self.weaviate_client.query.aggregate(index).with_fields("meta { count }").with_where(filter_dict).do()
)
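To make the new code path concrete, here is a rough sketch of the conversion a filter dictionary goes through before being handed to `with_where()`. The exact nesting produced by `LogicalFilterClause` is not shown in this excerpt, so the output below is an approximation based on the leaf operations earlier in the diff and Weaviate's operator/operands where-filter syntax.

```python
filters = {"type": {"$eq": "article"}, "rating": {"$gte": 3}}

filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
# Roughly:
# {
#     "operator": "And",
#     "operands": [
#         {"path": ["type"], "operator": "Equal", "valueString": "article"},
#         {"path": ["rating"], "operator": "GreaterThanEqual", "valueInt": 3},
#     ],
# }
```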
@ -538,7 +594,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def get_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -548,8 +604,32 @@ class WeaviateDocumentStore(BaseDocumentStore):

:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.

__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
"""
@ -566,7 +646,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def _get_all_documents_in_index(
self,
index: Optional[str],
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
batch_size: int = 10_000,
only_documents_without_embedding: bool = False,
) -> Generator[dict, None, None]:
@ -580,7 +660,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
properties.append("_additional {id, certainty, vector}")

if filters:
filter_dict = self._build_filter_clause(filters=filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
result = (
self.weaviate_client.query.get(class_name=index, properties=properties).with_where(filter_dict).do()
)
@ -597,7 +677,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def get_all_documents_generator(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
return_embedding: Optional[bool] = None,
batch_size: int = 10_000,
headers: Optional[Dict[str, str]] = None,
@ -609,8 +689,32 @@ class WeaviateDocumentStore(BaseDocumentStore):

:param index: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
:param filters: Optional filters to narrow down the documents to return.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.

__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param return_embedding: Whether to return the document embeddings.
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
"""
@ -630,7 +734,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def query(
self,
query: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
index: Optional[str] = None,
@ -640,7 +744,69 @@ class WeaviateDocumentStore(BaseDocumentStore):
that are most relevant to the query as defined by Weaviate semantic search.

:param query: The query
:param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.

__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```

To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.

__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return per query.
:param custom_query: Custom query that will be executed using query.raw method, for more details refer
https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html
@ -655,7 +821,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
if custom_query:
query_output = self.weaviate_client.query.raw(custom_query)
elif filters:
filter_dict = self._build_filter_clause(filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
query_output = (
self.weaviate_client.query.get(class_name=index, properties=properties)
.with_where(filter_dict)
@ -684,7 +850,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
def query_by_embedding(
self,
query_emb: np.ndarray,
filters: Optional[dict] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
index: Optional[str] = None,
return_embedding: Optional[bool] = None,
@ -694,8 +860,69 @@ class WeaviateDocumentStore(BaseDocumentStore):
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.

:param query_emb: Embedding of the query (e.g. gathered from DPR)
:param filters: Optional filters to narrow down the search space.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.

__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```

To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.

__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
:param top_k: How many documents to return
:param index: index name for storing the docs and metadata
:param return_embedding: To return document embedding
@ -719,7 +946,7 @@ class WeaviateDocumentStore(BaseDocumentStore):

query_string = {"vector": query_emb}
if filters:
filter_dict = self._build_filter_clause(filters)
filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate()
query_output = (
self.weaviate_client.query.get(class_name=index, properties=properties)
.with_where(filter_dict)
@ -751,7 +978,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
self,
retriever,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
update_existing_embeddings: bool = True,
batch_size: int = 10_000,
):
@ -763,8 +990,32 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Index name to update
:param update_existing_embeddings: Weaviate mandates an embedding while creating the document itself.
This option must always be true for Weaviate and it will update the embeddings for all the documents.
:param filters: Optional filters to narrow down the documents for which embeddings are to be updated.
Example: {"name": ["some", "more"], "category": ["only_one"]}
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.

__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:param batch_size: When working with large number of documents, batching can help reduce memory footprint.
:return: None
"""
@ -808,13 +1059,38 @@ class WeaviateDocumentStore(BaseDocumentStore):
def delete_all_documents(
self,
index: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
Delete documents in an index. All documents are deleted if no filters are passed.
:param index: Index name to delete the document from.
:param filters: Optional filters to narrow down the documents to be deleted.
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.

__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
:return: None
"""
if headers:
@ -832,7 +1108,7 @@ class WeaviateDocumentStore(BaseDocumentStore):
self,
index: Optional[str] = None,
ids: Optional[List[str]] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
headers: Optional[Dict[str, str]] = None,
):
"""
@ -841,11 +1117,35 @@ class WeaviateDocumentStore(BaseDocumentStore):
:param index: Index name to delete the document from. If None, the
DocumentStore's default index (self.index) will be used.
:param ids: Optional list of IDs to narrow down the documents to be deleted.
:param filters: Optional filters to narrow down the documents to be deleted.
Example filters: {"name": ["some", "more"], "category": ["only_one"]}.
If filters are provided along with a list of IDs, this method deletes the
intersection of the two query results (documents that match the filters and
have their ID in the list).
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.

__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
If filters are provided along with a list of IDs, this method deletes the
intersection of the two query results (documents that match the filters and
have their ID in the list).
:return: None
"""
if headers:

@ -873,7 +873,7 @@ class FARMReader(BaseReader):
)

# extract all questions for evaluation
filters = {"origin": [label_origin]}
filters: Dict = {"origin": [label_origin]}

labels = document_store.get_all_labels(index=label_index, filters=filters)


@ -124,7 +124,7 @@ class BaseRetriever(BaseComponent):
"""

# Extract all questions for evaluation
filters = {"origin": [label_origin]}
filters: Dict = {"origin": [label_origin]}

timed_retrieve = self.timing(self.retrieve, "retrieve_time")


@ -1,6 +1,6 @@
import logging
import os
from typing import Any, Dict, Generator, List, Optional
from typing import Any, Dict, Generator, List, Optional, Union

try:
from typing import Literal
@ -266,7 +266,7 @@ class IndexClient:
def query(
self,
query: Optional[str] = None,
filters: Optional[Dict[str, List[str]]] = None,
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: int = 10,
custom_query: Optional[str] = None,
query_emb: Optional[List[float]] = None,

@ -323,7 +323,7 @@ def test_docs_xs():
"meta_field": "test2",
"name": "filename2",
"date_field": "2019-10-01",
"numeric_field": 5,
"numeric_field": 5.0,
},
# Document object for a doc
Document(
@ -332,11 +332,11 @@ def test_docs_xs():
),
Document(
content="My name is Camila and I live in Madrid",
meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3},
meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3.0},
),
Document(
content="My name is Matteo and I live in Rome",
meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0},
meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0.0},
),
]

@ -530,16 +530,6 @@ def document_store_with_docs(request, test_docs_xs, tmp_path):
document_store = get_document_store(
document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path
)
# TODO: remove the following part once we allow numbers as metadatfield value in WeaviateDocumentStore
if request.param == "weaviate":
for doc in test_docs_xs:
if isinstance(doc, Document):
doc.meta["numeric_field"] = str(doc.meta["numeric_field"])
else:
if "meta" in doc:
doc["meta"]["numeric_field"] = str(doc["meta"]["numeric_field"])
else:
doc["numeric_field"] = str(doc["numeric_field"])
document_store.write_documents(test_docs_xs)
yield document_store
document_store.delete_documents()

@ -216,7 +216,7 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs)
assert len(documents) == 0


@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "weaviate"], indirect=True)
def test_extended_filter(document_store_with_docs):
# Test comparison operators individually
documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}})
@ -235,16 +235,16 @@ def test_extended_filter(document_store_with_docs):
documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$nin": ["test1", "test2", "n.a."]}})
assert len(documents) == 3

documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}})
assert len(documents) == 3

documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}})
assert len(documents) == 4

documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}})
assert len(documents) == 1

documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3}})
documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}})
assert len(documents) == 2

# Test compound filters
@ -265,29 +265,34 @@ def test_extended_filter(document_store_with_docs):
"name": ["filename5", "filename3"],
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
assert documents == documents_simplified_filter
# Order of returned documents might differ
assert len(documents) == len(documents_simplified_filter) and all(
doc in documents_simplified_filter for doc in documents
)

filters = {
"$and": {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5}},
"$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5.0}},
}
}
documents = document_store_with_docs.get_all_documents(filters=filters)
assert len(documents) == 2
filters_simplified = {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5}},
"$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5.0}},
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
assert documents == documents_simplified_filter
assert len(documents) == len(documents_simplified_filter) and all(
doc in documents_simplified_filter for doc in documents
)

filters = {
"$and": {
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": {"$in": ["filename5", "filename3"]},
"$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": {"$eq": "test2"}}},
"$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test2"}}},
},
}
}
@ -297,11 +302,28 @@ def test_extended_filter(document_store_with_docs):
"date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"},
"$or": {
"name": ["filename5", "filename3"],
"$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": "test2"}},
"$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test2"}},
},
}
documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified)
assert documents == documents_simplified_filter
assert len(documents) == len(documents_simplified_filter) and all(
doc in documents_simplified_filter for doc in documents
)

# Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore
filters = {
"$not": {
"$or": {
"$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test3"}},
"$not": {"date_field": {"$lt": "2020-01-01"}},
}
}
}
documents = document_store_with_docs.get_all_documents(filters=filters)
docs_meta = [doc.meta["meta_field"] for doc in documents]
assert len(documents) == 2
assert "test3" in docs_meta
assert "test5" in docs_meta

# Test same logical operator twice on same level
filters = {