From 2a674eaff7d711f38db1bd57ece9bb632fb928bd Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Fri, 18 Feb 2022 08:55:17 +0100 Subject: [PATCH] Support more data types and extended filters in WeaviateDocStore (#2143) * Support more data types and extended filters in WeaviateDocStore * Adapt types to extended filters * Update Documentation & Code Style * Fix mypy * Fix type of filters * Update Documentation & Code Style * Add Docstrings for BaseDocStore * Update Documentation & Code Style * Add + prettify DocStrings * Update Documentation & Code Style * Fix types * Update Documentation & Code Style * Remove import of TypedDict * Fix tests * Update Documentation & Code Style * Fix circular import * Fix inversion of not operation + add test case * Fix mypy * Update Documentation & Code Style * Apply black * Use convert_date_to_rfc3339 instead of datetime.fromisoformat Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/document_store.md | 771 +++++++++++++++++++--- docs/_src/api/openapi/openapi.json | 2 +- haystack/document_stores/base.py | 172 ++++- haystack/document_stores/deepsetcloud.py | 203 +++++- haystack/document_stores/elasticsearch.py | 72 +- haystack/document_stores/faiss.py | 16 +- haystack/document_stores/filter_utils.py | 222 ++++++- haystack/document_stores/memory.py | 22 +- haystack/document_stores/milvus.py | 18 +- haystack/document_stores/milvus2x.py | 10 +- haystack/document_stores/sql.py | 14 +- haystack/document_stores/utils.py | 21 + haystack/document_stores/weaviate.py | 404 ++++++++++-- haystack/nodes/reader/farm.py | 2 +- haystack/nodes/retriever/base.py | 2 +- haystack/utils/deepsetcloud.py | 4 +- test/conftest.py | 16 +- test/test_document_store.py | 46 +- 18 files changed, 1699 insertions(+), 318 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 908c05208..16d1a33e0 100644 --- a/docs/_src/api/api/document_store.md +++ 
b/docs/_src/api/api/document_store.md @@ -61,7 +61,7 @@ None ```python @abstractmethod -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get documents from the document store. @@ -70,8 +70,32 @@ Get documents from the document store. - `index`: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. -- `filters`: Optional filters to narrow down the documents to return. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` - `return_embedding`: Whether to return the document embeddings. - `batch_size`: Number of documents that are passed to bulk function at a time. - `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -82,7 +106,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} ```python @abstractmethod -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -94,8 +118,32 @@ a large number of documents without having to load all documents in memory. - `index`: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. -- `filters`: Optional filters to narrow down the documents to return. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + +__Example__: +```python +filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } +} +``` - `return_embedding`: Whether to return the document embeddings. - `batch_size`: When working with large number of documents, batching can help reduce memory footprint. - `headers`: Custom HTTP headers to pass to document store client if supported (e.g. 
{'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -105,7 +153,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### get\_all\_labels\_aggregated ```python -def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel] +def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel] ``` Return all labels in the DocumentStore, aggregated into MultiLabel objects. @@ -125,8 +173,32 @@ object, provided that they have the same product_id (to be found in Label.meta[" - `index`: Name of the index to get the labels from. If None, the DocumentStore's default index (self.index) will be used. -- `filters`: Optional filters to narrow down the labels to return. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. 
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` - `open_domain`: When True, labels are aggregated purely based on the question text alone. When False, labels are aggregated in a closed domain fashion based on the question text and also the id of the document that the label is tied to. In this setting, this function @@ -216,7 +288,7 @@ Base class for implementing Document Stores that support keyword searches. ```python @abstractmethod -def query(query: Optional[str], filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -226,7 +298,69 @@ that are most relevant to the query as defined by keyword matching algorithms li **Arguments**: - `query`: The query -- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` - `top_k`: How many documents to return per query. - `custom_query`: Custom query to be executed. - `index`: The name of the index in the DocumentStore from which to retrieve documents @@ -280,7 +414,7 @@ to performance issues. 
Note that Elasticsearch limits the number of results to 1 #### get\_metadata\_values\_by\_key ```python -def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict] +def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict] ``` Get values associated with a metadata key. The output is in the format: @@ -302,7 +436,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -399,7 +533,7 @@ Update the metadata dictionary of a document by specifying its string id #### get\_document\_count ```python -def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int ``` Return the number of documents in the document store. @@ -419,7 +553,7 @@ Return the number of labels in the document store #### get\_embedding\_count ```python -def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> int +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> int ``` Return the count of embeddings in the document store. 
@@ -429,7 +563,7 @@ Return the count of embeddings in the document store. #### get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get documents from the document store. @@ -449,7 +583,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -473,7 +607,7 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c #### get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -496,7 +630,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. 
-Example: + __Example__: ```python filters = { "$and": { @@ -520,7 +654,7 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c #### get\_all\_labels ```python -def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label] +def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label] ``` Return all labels in the document store @@ -530,7 +664,7 @@ Return all labels in the document store #### query ```python -def query(query: Optional[str], filters: Optional[Dict[str, Any]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -552,7 +686,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -577,10 +711,10 @@ Example: } ``` -To use the same logical operator multiple times on the same level, logical operators take -optionally a list of dictionaries as value. + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
-Example: + __Example__: ```python filters = { "$or": [ @@ -677,7 +811,7 @@ Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-c #### query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -697,7 +831,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -722,10 +856,10 @@ Example: } ``` -To use the same logical operator multiple times on the same level, logical operators take -optionally a list of dictionaries as value. + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
-Example: + __Example__: ```python filters = { "$or": [ @@ -769,7 +903,7 @@ Return a summary of the documents in the document store #### update\_embeddings ```python -def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) +def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) ``` Updates the embeddings in the the document store using the encoding model specified in the retriever. @@ -795,7 +929,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -822,7 +956,7 @@ None #### delete\_all\_documents ```python -def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -841,7 +975,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. 
-Example: + __Example__: ```python filters = { "$and": { @@ -867,7 +1001,7 @@ None #### delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -888,7 +1022,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -903,9 +1037,9 @@ Example: } ``` -If filters are provided along with a list of IDs, this method deletes the -intersection of the two query results (documents that match the filters and -have their ID in the list). + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). - `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. @@ -918,7 +1052,7 @@ None #### delete\_labels ```python -def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete labels in an index. All labels are deleted if no filters are passed. 
@@ -939,7 +1073,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -991,7 +1125,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore) #### query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -1011,7 +1145,7 @@ If no logical operator is provided, `"$and"` is used as default operation. If no operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. -Example: + __Example__: ```python filters = { "$and": { @@ -1036,10 +1170,10 @@ Example: } ``` -To use the same logical operator multiple times on the same level, logical operators take -optionally a list of dictionaries as value. + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
-Example: + __Example__: ```python filters = { "$or": [ @@ -1185,7 +1319,7 @@ Calculate similarity scores between query embedding and a list of documents usin #### query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -1204,7 +1338,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### update\_embeddings ```python -def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) ``` Updates the embeddings in the the document store using the encoding model specified in the retriever. @@ -1232,7 +1366,7 @@ None #### get\_document\_count ```python -def get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int ``` Return the number of documents in the document store. @@ -1262,7 +1396,7 @@ Return the number of labels in the document store. 
#### get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get all documents from the document store as a list. @@ -1280,7 +1414,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get all documents from the document store. The methods returns a Python Generator that yields individual @@ -1300,7 +1434,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### get\_all\_labels ```python -def get_all_labels(index: str = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label] +def get_all_labels(index: str = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label] ``` Return all labels in the document store. @@ -1310,7 +1444,7 @@ Return all labels in the document store. 
#### delete\_all\_documents ```python -def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -1329,7 +1463,7 @@ None #### delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -1354,7 +1488,7 @@ None #### delete\_labels ```python -def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete labels in an index. All labels are deleted if no filters are passed. 
@@ -1418,7 +1552,7 @@ Fetch documents by specifying a list of text vector id strings #### get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -1527,7 +1661,7 @@ Update the metadata dictionary of a document by specifying its string id #### get\_document\_count ```python -def get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int ``` Return the number of documents in the document store. @@ -1547,7 +1681,7 @@ Return the number of labels in the document store #### delete\_all\_documents ```python -def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. 
@@ -1566,7 +1700,7 @@ None #### delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -1591,7 +1725,7 @@ None #### delete\_labels ```python -def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete labels from the document store. All labels are deleted if no filters are passed. @@ -1664,7 +1798,7 @@ None #### update\_embeddings ```python -def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, List[str]]] = None, batch_size: int = 10_000) +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None, batch_size: int = 10_000) ``` Updates the embeddings in the the document store using the encoding model specified in the retriever. 
@@ -1692,7 +1826,7 @@ None #### get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get all documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -1714,7 +1848,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### get\_embedding\_count ```python -def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int ``` Return the count of embeddings in the document store. @@ -1747,7 +1881,7 @@ None #### delete\_all\_documents ```python -def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete all documents from the document store. @@ -1757,7 +1891,7 @@ Delete all documents from the document store. #### delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents from the document store. 
All documents are deleted if no filters are passed. @@ -1782,7 +1916,7 @@ None #### query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -1902,7 +2036,7 @@ None #### update\_embeddings ```python -def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, List[str]]] = None) +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None) ``` Updates the embeddings in the the document store using the encoding model specified in the retriever. @@ -1930,7 +2064,7 @@ None #### query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. 
@@ -1953,7 +2087,7 @@ list of Documents that are the most similar to `query_emb` #### delete\_all\_documents ```python -def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete all documents (from SQL AND Milvus). @@ -1973,7 +2107,7 @@ None #### delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -1998,7 +2132,7 @@ None #### get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get all documents from the document store. 
Under-the-hood, documents are fetched in batches from the @@ -2020,7 +2154,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get documents from the document store (optionally using filter criteria). @@ -2091,7 +2225,7 @@ List[np.array]: List of vectors. #### get\_embedding\_count ```python -def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int ``` Return the count of embeddings in the document store. @@ -2183,27 +2317,29 @@ None #### update\_document\_meta ```python -def update_document_meta(id: str, meta: Dict[str, str], index: str = None) +def update_document_meta(id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None) ``` Update the metadata dictionary of a document by specifying its string id. +Overwrites only the specified fields, the unspecified ones remain unchanged. 
#### get\_embedding\_count ```python -def get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int +def get_embedding_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None) -> int ``` -Return the number of embeddings in the document store, which is the same as the number of documents since every document has a default embedding +Return the number of embeddings in the document store, which is the same as the number of documents since +every document has a default embedding. #### get\_document\_count ```python -def get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int ``` Return the number of documents in the document store. @@ -2213,7 +2349,7 @@ Return the number of documents in the document store. #### get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get documents from the document store. @@ -2222,8 +2358,32 @@ Get documents from the document store. - `index`: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. 
-- `filters`: Optional filters to narrow down the documents to return. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` - `return_embedding`: Whether to return the document embeddings. - `batch_size`: When working with large number of documents, batching can help reduce memory footprint. 
@@ -2232,7 +2392,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -2244,8 +2404,32 @@ a large number of documents without having to load all documents in memory. - `index`: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. -- `filters`: Optional filters to narrow down the documents to return. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` - `return_embedding`: Whether to return the document embeddings. - `batch_size`: When working with large number of documents, batching can help reduce memory footprint. @@ -2254,7 +2438,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### query ```python -def query(query: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None) -> List[Document] +def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -2264,7 +2448,69 @@ that are most relevant to the query as defined by Weaviate semantic search. **Arguments**: - `query`: The query -- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. 
If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators can + optionally take a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` - `top_k`: How many documents to return per query. - `custom_query`: Custom query that will executed using query.raw method, for more details refer https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html @@ -2275,7 +2521,7 @@ https://www.semi.technology/developers/weaviate/current/graphql-references/filte #### query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
@@ -2283,8 +2529,69 @@ Find the document that is most similar to the provided `query_emb` by using a ve **Arguments**: - `query_emb`: Embedding of the query (e.g. gathered from DPR) -- `filters`: Optional filters to narrow down the search space. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators can + optionally take a list of dictionaries as value.
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` - `top_k`: How many documents to return - `index`: index name for storing the docs and metadata - `return_embedding`: To return document embedding @@ -2294,7 +2601,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### update\_embeddings ```python -def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) +def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) ``` Updates the embeddings in the the document store using the encoding model specified in the retriever. @@ -2307,8 +2614,32 @@ This can be useful if want to change the embeddings for your documents (e.g. aft - `index`: Index name to update - `update_existing_embeddings`: Weaviate mandates an embedding while creating the document itself. This option must be always true for weaviate and it will update the embeddings for all the documents. -- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. 
Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` - `batch_size`: When working with large number of documents, batching can help reduce memory footprint. **Returns**: @@ -2320,7 +2651,7 @@ None #### delete\_all\_documents ```python -def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -2328,7 +2659,32 @@ Delete documents in an index. All documents are deleted if no filters are passed **Arguments**: - `index`: Index name to delete the document from. -- `filters`: Optional filters to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. 
Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` **Returns**: @@ -2339,7 +2695,7 @@ None #### delete\_documents ```python -def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None, headers: Optional[Dict[str, str]] = None) +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) ``` Delete documents in an index. All documents are deleted if no filters are passed. @@ -2349,11 +2705,35 @@ Delete documents in an index. All documents are deleted if no filters are passed - `index`: Index name to delete the document from. If None, the DocumentStore's default index (self.index) will be used. - `ids`: Optional list of IDs to narrow down the documents to be deleted. -- `filters`: Optional filters to narrow down the documents to be deleted. -Example filters: {"name": ["some", "more"], "category": ["only_one"]}. -If filters are provided along with a list of IDs, this method deletes the -intersection of the two query results (documents that match the filters and -have their ID in the list). +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). **Returns**: @@ -2532,7 +2912,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore) #### get\_all\_documents ```python -def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Get documents from the document store. @@ -2541,8 +2921,32 @@ Get documents from the document store. - `index`: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. 
-- `filters`: Optional filters to narrow down the documents to return. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` - `return_embedding`: Whether to return the document embeddings. - `batch_size`: Number of documents that are passed to bulk function at a time. - `headers`: Custom HTTP headers to pass to document store client if supported (e.g. 
{'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -2552,7 +2956,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### get\_all\_documents\_generator ```python -def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] ``` Get documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -2564,8 +2968,32 @@ a large number of documents without having to load all documents in memory. - `index`: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. -- `filters`: Optional filters to narrow down the documents to return. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. 
If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` - `return_embedding`: Whether to return the document embeddings. - `batch_size`: When working with large number of documents, batching can help reduce memory footprint. - `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -2575,7 +3003,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### query\_by\_embedding ```python -def query_by_embedding(query_emb: np.ndarray, filters: Optional[Optional[Dict[str, List[str]]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -2583,8 +3011,69 @@ Find the document that is most similar to the provided `query_emb` by using a ve **Arguments**: - `query_emb`: Embedding of the query (e.g. gathered from DPR) -- `filters`: Optional filters to narrow down the search space. -Example: {"name": ["some", "more"], "category": ["only_one"]} +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators can + optionally take a list of dictionaries as value.
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` - `top_k`: How many documents to return - `index`: Index name for storing the docs and metadata - `return_embedding`: To return document embedding @@ -2595,7 +3084,7 @@ Example: {"name": ["some", "more"], "category": ["only_one"]} #### query ```python -def query(query: Optional[str], filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -2605,7 +3094,69 @@ that are most relevant to the query as defined by the BM25 algorithm. **Arguments**: - `query`: The query -- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. 
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` - `top_k`: How many documents to return per query. - `custom_query`: Custom query to be executed. 
- `index`: The name of the index in the DocumentStore from which to retrieve documents diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json index f5fcee5b7..12ddf0ba0 100644 --- a/docs/_src/api/openapi/openapi.json +++ b/docs/_src/api/openapi/openapi.json @@ -2,7 +2,7 @@ "openapi": "3.0.2", "info": { "title": "Haystack REST API", - "version": "1.1.0" + "version": "1.2.0rc0" }, "paths": { "/initialized": { diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index 058deb992..9c6f78e67 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -98,7 +98,7 @@ class BaseDocumentStore(BaseComponent): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -108,8 +108,33 @@ class BaseDocumentStore(BaseComponent): :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. 
+ If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + :param return_embedding: Whether to return the document embeddings. :param batch_size: Number of documents that are passed to bulk function at a time. :param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -120,7 +145,7 @@ class BaseDocumentStore(BaseComponent): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -132,8 +157,33 @@ class BaseDocumentStore(BaseComponent): :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. 
Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + :param return_embedding: Whether to return the document embeddings. :param batch_size: When working with large number of documents, batching can help reduce memory footprint. :param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -158,7 +208,7 @@ class BaseDocumentStore(BaseComponent): def get_all_labels( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ) -> List[Label]: pass @@ -166,7 +216,7 @@ class BaseDocumentStore(BaseComponent): def get_all_labels_aggregated( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, @@ -188,8 +238,33 @@ class BaseDocumentStore(BaseComponent): :param index: Name of the index to get the labels from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the labels to return. 
- Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + :param open_domain: When True, labels are aggregated purely based on the question text alone. When False, labels are aggregated in a closed domain fashion based on the question text and also the id of the document that the label is tied to. 
In this setting, this function @@ -260,7 +335,7 @@ class BaseDocumentStore(BaseComponent): @abstractmethod def get_document_count( self, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None, @@ -299,7 +374,7 @@ class BaseDocumentStore(BaseComponent): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[Optional[Dict[str, List[str]]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -413,7 +488,7 @@ class BaseDocumentStore(BaseComponent): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): pass @@ -423,7 +498,7 @@ class BaseDocumentStore(BaseComponent): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): pass @@ -433,7 +508,7 @@ class BaseDocumentStore(BaseComponent): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): pass @@ -564,7 +639,7 @@ class KeywordDocumentStore(BaseDocumentStore): def query( self, query: Optional[str], - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, @@ -575,7 +650,70 @@ class 
KeywordDocumentStore(BaseDocumentStore): that are most relevant to the query as defined by keyword matching algorithms like BM25. :param query: The query - :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` + :param top_k: How many documents to return per query. :param custom_query: Custom query to be executed. :param index: The name of the index in the DocumentStore from which to retrieve documents diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py index d806420a2..3a76f2313 100644 --- a/haystack/document_stores/deepsetcloud.py +++ b/haystack/document_stores/deepsetcloud.py @@ -77,7 +77,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -87,8 +87,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. 
Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` :param return_embedding: Whether to return the document embeddings. :param batch_size: Number of documents that are passed to bulk function at a time. :param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -106,7 +130,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -118,8 +142,32 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+ Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` :param return_embedding: Whether to return the document embeddings. :param batch_size: When working with large number of documents, batching can help reduce memory footprint. :param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) @@ -168,7 +216,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): def get_document_count( self, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None, @@ -184,7 +232,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[Optional[Dict[str, List[str]]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -194,8 +242,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. 
:param query_emb: Embedding of the query (e.g. gathered from DPR) - :param filters: Optional filters to narrow down the search space. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` :param top_k: How many documents to return :param index: Index name for storing the docs and metadata :param return_embedding: To return document embedding @@ -220,7 +329,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): def query( self, query: Optional[str], - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, @@ -231,7 +340,69 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): that are most relevant to the query as defined by the BM25 algorithm. :param query: The query - :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` :param top_k: How many documents to return per query. :param custom_query: Custom query to be executed. :param index: The name of the index in the DocumentStore from which to retrieve documents @@ -280,7 +451,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): def get_all_labels( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ) -> List[Label]: raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.") @@ -299,7 +470,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.") @@ -308,7 +479,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): self, index: 
Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): raise NotImplementedError("DeepsetCloudDocumentStore currently does not support deleting documents.") @@ -317,7 +488,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): raise NotImplementedError("DeepsetCloudDocumentStore currently does not support labels.") diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 563f4a9ed..67a45e722 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -478,7 +478,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): self, key: str, query: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, ) -> List[dict]: @@ -499,7 +499,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. 
- Example: + __Example__: ```python filters = { "$and": { @@ -705,7 +705,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def get_document_count( self, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None, @@ -736,7 +736,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def get_embedding_count( self, index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ) -> int: """ @@ -756,7 +756,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -777,7 +777,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. - Example: + __Example__: ```python filters = { "$and": { @@ -805,7 +805,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -828,7 +828,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. 
- Example: + __Example__: ```python filters = { "$and": { @@ -862,7 +862,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def get_all_labels( self, index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000, ) -> List[Label]: @@ -879,7 +879,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def _get_all_documents_in_index( self, index: str, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 10_000, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None, @@ -901,7 +901,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def query( self, query: Optional[str], - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, @@ -924,7 +924,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. - Example: + __Example__: ```python filters = { "$and": { @@ -949,10 +949,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): } ``` - To use the same logical operator multiple times on the same level, logical operators take - optionally a list of dictionaries as value. + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
- Example: + __Example__: ```python filters = { "$or": [ @@ -1105,7 +1105,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -1127,7 +1127,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. - Example: + __Example__: ```python filters = { "$and": { @@ -1152,10 +1152,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): } ``` - To use the same logical operator multiple times on the same level, logical operators take - optionally a list of dictionaries as value. + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. - Example: + __Example__: ```python filters = { "$or": [ @@ -1348,7 +1348,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): self, retriever, index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -1374,7 +1374,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. 
- Example: + __Example__: ```python filters = { "$and": { @@ -1449,7 +1449,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): """ @@ -1467,7 +1467,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. - Example: + __Example__: ```python filters = { "$and": { @@ -1497,7 +1497,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): """ @@ -1517,7 +1517,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. - Example: + __Example__: ```python filters = { "$and": { @@ -1532,9 +1532,9 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): } ``` - If filters are provided along with a list of IDs, this method deletes the - intersection of the two query results (documents that match the filters and - have their ID in the list). + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. 
:return: None @@ -1560,7 +1560,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): """ @@ -1580,7 +1580,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. - Example: + __Example__: ```python filters = { "$and": { @@ -1695,7 +1695,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[Dict[str, Any]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -1717,7 +1717,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore): operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default operation. - Example: + __Example__: ```python filters = { "$and": { @@ -1742,10 +1742,10 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore): } ``` - To use the same logical operator multiple times on the same level, logical operators take - optionally a list of dictionaries as value. + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
- Example: + __Example__: ```python filters = { "$or": [ diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py index 757bc4e6a..ea64fbede 100644 --- a/haystack/document_stores/faiss.py +++ b/haystack/document_stores/faiss.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from haystack.nodes.retriever import BaseRetriever @@ -308,7 +308,7 @@ class FAISSDocumentStore(SQLDocumentStore): retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore batch_size: int = 10_000, ): """ @@ -379,7 +379,7 @@ class FAISSDocumentStore(SQLDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -396,7 +396,7 @@ class FAISSDocumentStore(SQLDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -447,7 +447,7 @@ class FAISSDocumentStore(SQLDocumentStore): doc.embedding = self.faiss_indexes[index].reconstruct(int(doc.meta["vector_id"])) return documents - def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int: + def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int: """ Return the count of embeddings in the 
document store. """ @@ -486,7 +486,7 @@ class FAISSDocumentStore(SQLDocumentStore): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -507,7 +507,7 @@ class FAISSDocumentStore(SQLDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -546,7 +546,7 @@ class FAISSDocumentStore(SQLDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in FAISSDocStore top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py index a45b98734..6b97a0f6b 100644 --- a/haystack/document_stores/filter_utils.py +++ b/haystack/document_stores/filter_utils.py @@ -1,9 +1,11 @@ -from typing import Union, List, Dict +from typing import Union, List, Dict, Optional, Tuple from abc import ABC, abstractmethod from collections import defaultdict +from haystack.document_stores.utils import convert_date_to_rfc3339 -def nested_defaultdict(): + +def nested_defaultdict() -> defaultdict: """ Data structure that recursively adds a dictionary as value if a key does not exist. 
Advantage: In nested dictionary structures, we don't need to check if a key already exists (which can become hard to maintain in nested dictionaries @@ -81,17 +83,17 @@ class LogicalFilterClause(ABC): """ - def __init__(self, conditions: List["LogicalFilterClause"]): + def __init__(self, conditions: List[Union["LogicalFilterClause", "ComparisonOperation"]]): self.conditions = conditions @classmethod - def parse(cls, filter_term: Union[dict, List[dict]]): + def parse(cls, filter_term: Union[dict, List[dict]]) -> Union["LogicalFilterClause", "ComparisonOperation"]: """ Parses a filter dictionary/list and returns a LogicalFilterClause instance. :param filter_term: Dictionary or list that contains the filter definition. """ - conditions = [] + conditions: List[Union[LogicalFilterClause, ComparisonOperation]] = [] if isinstance(filter_term, dict): filter_term = [filter_term] @@ -122,7 +124,14 @@ class LogicalFilterClause(ABC): """ pass - def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict]: + @abstractmethod + def convert_to_weaviate(self): + """ + Converts the LogicalFilterClause instance to a Weaviate filter. + """ + pass + + def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict[str, Dict]]: """ Merges Elasticsearch range queries that perform on the same metadata field. """ @@ -142,14 +151,23 @@ class LogicalFilterClause(ABC): return conditions + @abstractmethod + def invert(self) -> Union["LogicalFilterClause", "ComparisonOperation"]: + """ + Inverts the LogicalOperation instance. + Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore. 
+ (https://github.com/semi-technologies/weaviate/issues/1717) + """ + pass + class ComparisonOperation(ABC): - def __init__(self, field_name: str, comparison_value: Union[str, float, List]): + def __init__(self, field_name: str, comparison_value: Union[str, int, float, bool, List]): self.field_name = field_name self.comparison_value = comparison_value @classmethod - def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]): + def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]) -> List["ComparisonOperation"]: comparison_operations: List[ComparisonOperation] = [] if isinstance(comparison_clause, dict): @@ -187,107 +205,273 @@ class ComparisonOperation(ABC): """ pass + @abstractmethod + def convert_to_weaviate(self): + """ + Converts the ComparisonOperation instance to a Weaviate comparison operator. + """ + pass + + @abstractmethod + def invert(self) -> "ComparisonOperation": + """ + Inverts the ComparisonOperation. + Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore. + (https://github.com/semi-technologies/weaviate/issues/1717) + """ + pass + + def _get_weaviate_datatype( + self, value: Optional[Union[str, int, float, bool]] = None + ) -> Tuple[str, Union[str, int, float, bool]]: + """ + Determines the type of the comparison value and converts it to RFC3339 format if it is a date, + as Weaviate requires dates to be in RFC3339 format including the time and timezone.
+ + """ + if value is None: + assert not isinstance(self.comparison_value, list) # Necessary for mypy + value = self.comparison_value + + if isinstance(value, str): + # Check if comparison value is a date + try: + value = convert_date_to_rfc3339(value) + data_type = "valueDate" + # Comparison value is a plain string + except ValueError: + data_type = "valueString" + elif isinstance(value, int): + data_type = "valueInt" + elif isinstance(value, float): + data_type = "valueNumber" + elif isinstance(value, bool): + data_type = "valueBoolean" + else: + raise ValueError( + f"Unsupported data type of comparison value for {self.__class__.__name__}." + f"Value needs to be of type str, int, float, or bool." + ) + + return data_type, value + class NotOperation(LogicalFilterClause): """ Handles conversion of logical 'NOT' operations. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict]: conditions = [condition.convert_to_elasticsearch() for condition in self.conditions] conditions = self._merge_es_range_queries(conditions) return {"bool": {"must_not": conditions}} + def convert_to_weaviate(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]: + conditions = [condition.invert().convert_to_weaviate() for condition in self.conditions] + if len(conditions) > 1: + # Conditions in self.conditions are by default combined with AND which becomes OR according to DeMorgan + return {"operator": "Or", "operands": conditions} + else: + return conditions[0] + + def invert(self) -> Union[LogicalFilterClause, ComparisonOperation]: + # This method is called when a "$not" operation is embedded in another "$not" operation. Therefore, we don't + # invert the operations here, as two "$not" operations annihilate each other. + # (If we have more than one condition, we return an AndOperation, the default logical operation for combining + # multiple conditions.)
+ if len(self.conditions) > 1: + return AndOperation(self.conditions) + else: + return self.conditions[0] + class AndOperation(LogicalFilterClause): """ Handles conversion of logical 'AND' operations. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict]: conditions = [condition.convert_to_elasticsearch() for condition in self.conditions] conditions = self._merge_es_range_queries(conditions) return {"bool": {"must": conditions}} + def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: + conditions = [condition.convert_to_weaviate() for condition in self.conditions] + return {"operator": "And", "operands": conditions} + + def invert(self) -> "OrOperation": + return OrOperation([condition.invert() for condition in self.conditions]) + class OrOperation(LogicalFilterClause): """ Handles conversion of logical 'OR' operations. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict]: conditions = [condition.convert_to_elasticsearch() for condition in self.conditions] conditions = self._merge_es_range_queries(conditions) return {"bool": {"should": conditions}} + def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: + conditions = [condition.convert_to_weaviate() for condition in self.conditions] + return {"operator": "Or", "operands": conditions} + + def invert(self) -> AndOperation: + return AndOperation([condition.invert() for condition in self.conditions]) + class EqOperation(ComparisonOperation): """ Handles conversion of the '$eq' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Union[str, int, float, bool]]]: + assert not isinstance(self.comparison_value, list), "Use '$in' operation for lists as comparison values." 
return {"term": {self.field_name: self.comparison_value}} + def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]: + comp_value_type, comp_value = self._get_weaviate_datatype() + return {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value} + + def invert(self) -> "NeOperation": + return NeOperation(self.field_name, self.comparison_value) + class InOperation(ComparisonOperation): """ Handles conversion of the '$in' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, List]]: + assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list." return {"terms": {self.field_name: self.comparison_value}} + def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: + filter_dict: Dict[str, Union[str, List[Dict]]] = {"operator": "Or", "operands": []} + assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list." + for value in self.comparison_value: + comp_value_type, comp_value = self._get_weaviate_datatype(value) + assert isinstance(filter_dict["operands"], list) # Necessary for mypy + filter_dict["operands"].append( + {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value} + ) + + return filter_dict + + def invert(self) -> "NinOperation": + return NinOperation(self.field_name, self.comparison_value) + class NeOperation(ComparisonOperation): """ Handles conversion of the '$ne' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, Union[str, int, float, bool]]]]]: + assert not isinstance(self.comparison_value, list), "Use '$nin' operation for lists as comparison values." 
return {"bool": {"must_not": {"term": {self.field_name: self.comparison_value}}}} + def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]: + comp_value_type, comp_value = self._get_weaviate_datatype() + return {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value} + + def invert(self) -> "EqOperation": + return EqOperation(self.field_name, self.comparison_value) + class NinOperation(ComparisonOperation): """ Handles conversion of the '$nin' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Dict[str, List]]]]: + assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list." return {"bool": {"must_not": {"terms": {self.field_name: self.comparison_value}}}} + def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: + filter_dict: Dict[str, Union[str, List[Dict]]] = {"operator": "And", "operands": []} + assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list." + for value in self.comparison_value: + comp_value_type, comp_value = self._get_weaviate_datatype(value) + assert isinstance(filter_dict["operands"], list) # Necessary for mypy + filter_dict["operands"].append( + {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value} + ) + + return filter_dict + + def invert(self) -> "InOperation": + return InOperation(self.field_name, self.comparison_value) + class GtOperation(ComparisonOperation): """ Handles conversion of the '$gt' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list." 
return {"range": {self.field_name: {"gt": self.comparison_value}}} + def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: + comp_value_type, comp_value = self._get_weaviate_datatype() + assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list." + return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value} + + def invert(self) -> "LteOperation": + return LteOperation(self.field_name, self.comparison_value) + class GteOperation(ComparisonOperation): """ Handles conversion of the '$gte' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list." return {"range": {self.field_name: {"gte": self.comparison_value}}} + def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: + comp_value_type, comp_value = self._get_weaviate_datatype() + assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list." + return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value} + + def invert(self) -> "LtOperation": + return LtOperation(self.field_name, self.comparison_value) + class LtOperation(ComparisonOperation): """ Handles conversion of the '$lt' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list." 
return {"range": {self.field_name: {"lt": self.comparison_value}}} + def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: + comp_value_type, comp_value = self._get_weaviate_datatype() + assert not isinstance(comp_value, list), "Comparison value for '$lt' operation must not be a list." + return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value} + + def invert(self) -> "GteOperation": + return GteOperation(self.field_name, self.comparison_value) + class LteOperation(ComparisonOperation): """ Handles conversion of the '$lte' comparison operation. """ - def convert_to_elasticsearch(self): + def convert_to_elasticsearch(self) -> Dict[str, Dict[str, Dict[str, Union[str, float, int]]]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list." return {"range": {self.field_name: {"lte": self.comparison_value}}} + + def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: + comp_value_type, comp_value = self._get_weaviate_datatype() + assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list." 
+ return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value} + + def invert(self) -> "GtOperation": + return GtOperation(self.field_name, self.comparison_value) diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py index 6745ceb12..583c09e61 100644 --- a/haystack/document_stores/memory.py +++ b/haystack/document_stores/memory.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator +from typing import TYPE_CHECKING, Dict, List, Optional, Union, Generator, Any if TYPE_CHECKING: from haystack.nodes.retriever import BaseRetriever @@ -291,7 +291,7 @@ class InMemoryDocumentStore(BaseDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -337,7 +337,7 @@ class InMemoryDocumentStore(BaseDocumentStore): self, retriever: "BaseRetriever", index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore update_existing_embeddings: bool = True, batch_size: int = 10_000, ): @@ -390,7 +390,7 @@ class InMemoryDocumentStore(BaseDocumentStore): def get_document_count( self, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None, @@ -427,7 +427,7 @@ class InMemoryDocumentStore(BaseDocumentStore): def _query( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow 
extended filters in InMemoryDocStore return_embedding: Optional[bool] = None, only_documents_without_embedding: bool = False, ): @@ -464,7 +464,7 @@ class InMemoryDocumentStore(BaseDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -490,7 +490,7 @@ class InMemoryDocumentStore(BaseDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -514,7 +514,7 @@ class InMemoryDocumentStore(BaseDocumentStore): def get_all_labels( self, index: str = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore headers: Optional[Dict[str, str]] = None, ) -> List[Label]: """ @@ -544,7 +544,7 @@ class InMemoryDocumentStore(BaseDocumentStore): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -569,7 +569,7 @@ class InMemoryDocumentStore(BaseDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -603,7 +603,7 @@ class 
InMemoryDocumentStore(BaseDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in InMemoryDocStore headers: Optional[Dict[str, str]] = None, ): """ diff --git a/haystack/document_stores/milvus.py b/haystack/document_stores/milvus.py index dbb1beb08..242307094 100644 --- a/haystack/document_stores/milvus.py +++ b/haystack/document_stores/milvus.py @@ -307,7 +307,7 @@ class MilvusDocumentStore(SQLDocumentStore): index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore ): """ Updates the embeddings in the the document store using the encoding model specified in the retriever. @@ -374,7 +374,7 @@ class MilvusDocumentStore(SQLDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[dict] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -440,7 +440,7 @@ class MilvusDocumentStore(SQLDocumentStore): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -465,7 +465,7 @@ class MilvusDocumentStore(SQLDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -508,7 
+508,7 @@ class MilvusDocumentStore(SQLDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -541,7 +541,7 @@ class MilvusDocumentStore(SQLDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -676,7 +676,11 @@ class MilvusDocumentStore(SQLDocumentStore): return vectors - def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int: + def get_embedding_count( + self, + index: Optional[str] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore + ) -> int: """ Return the count of embeddings in the document store. """ diff --git a/haystack/document_stores/milvus2x.py b/haystack/document_stores/milvus2x.py index bd3009795..1b28c2cfa 100644 --- a/haystack/document_stores/milvus2x.py +++ b/haystack/document_stores/milvus2x.py @@ -375,7 +375,7 @@ class Milvus2DocumentStore(SQLDocumentStore): index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore ): """ Updates the embeddings in the the document store using the encoding model specified in the retriever. 
@@ -457,7 +457,7 @@ class Milvus2DocumentStore(SQLDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[dict] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -538,7 +538,7 @@ class Milvus2DocumentStore(SQLDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -571,7 +571,7 @@ class Milvus2DocumentStore(SQLDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -604,7 +604,7 @@ class Milvus2DocumentStore(SQLDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in Milvus2DocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index 1948966d8..39fa07a85 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -215,7 +215,7 @@ class SQLDocumentStore(BaseDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore return_embedding: 
Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -233,7 +233,7 @@ class SQLDocumentStore(BaseDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -271,7 +271,7 @@ class SQLDocumentStore(BaseDocumentStore): def _query( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore vector_ids: Optional[List[str]] = None, only_documents_without_embedding: bool = False, batch_size: int = 10_000, @@ -521,7 +521,7 @@ class SQLDocumentStore(BaseDocumentStore): def get_document_count( self, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None, @@ -609,7 +609,7 @@ class SQLDocumentStore(BaseDocumentStore): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -634,7 +634,7 @@ class SQLDocumentStore(BaseDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore headers: Optional[Dict[str, str]] = None, ): """ @@ -674,7 +674,7 @@ class SQLDocumentStore(BaseDocumentStore): self, index: 
Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in SQLDocStore headers: Optional[Dict[str, str]] = None, ): """ diff --git a/haystack/document_stores/utils.py b/haystack/document_stores/utils.py index 1e7c74c7c..5743a1502 100644 --- a/haystack/document_stores/utils.py +++ b/haystack/document_stores/utils.py @@ -2,6 +2,7 @@ from typing import Dict, List, Optional, Tuple, Union, Generator import json import logging +from datetime import datetime from haystack.schema import Document, Label, Answer, Span from haystack.nodes.preprocessor import PreProcessor @@ -250,3 +251,23 @@ def _extract_docs_and_labels_from_dict( labels.append(label) return docs, labels, problematic_ids + + +def convert_date_to_rfc3339(date: str) -> str: + """ + Converts a date to RFC3339 format, as Weaviate requires dates to be in RFC3339 format including the time and + timezone. + + If the provided date string does not contain a time and/or timezone, we use 00:00 as default time + and UTC as default time zone. + + This method cannot be part of WeaviateDocumentStore, as this would result in a circular import between weaviate.py + and filter_utils.py. 
+ """ + parsed_datetime = datetime.fromisoformat(date) + if parsed_datetime.utcoffset() is None: + converted_date = parsed_datetime.isoformat() + "Z" + else: + converted_date = parsed_datetime.isoformat() + + return converted_date diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index 61a2a3d1d..e4c110f28 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -2,6 +2,7 @@ import hashlib import re import uuid from typing import Dict, Generator, List, Optional, Union +from datetime import datetime import logging import json @@ -11,6 +12,8 @@ from tqdm import tqdm from haystack.schema import Document from haystack.document_stores import BaseDocumentStore from haystack.document_stores.base import get_batches_from_generator +from haystack.document_stores.filter_utils import LogicalFilterClause +from haystack.document_stores.utils import convert_date_to_rfc3339 try: from weaviate import client, AuthClientPassword @@ -225,8 +228,8 @@ class WeaviateDocumentStore(BaseDocumentStore): content = json.loads(str(props.get(self.content_field))) content_type = None - if props.get("contenttype") is not None: - content_type = str(props.pop("contenttype")) + if props.get("content_type") is not None: + content_type = str(props.pop("content_type")) # Weaviate creates "_additional" key for semantic search if "_additional" in props: @@ -337,30 +340,61 @@ class WeaviateDocumentStore(BaseDocumentStore): return cur_properties - def _build_filter_clause(self, filters: Dict[str, List[str]]) -> dict: + def _get_date_properties(self, index: Optional[str] = None) -> List[str]: """ - Transform Haystack filter conditions to Weaviate where filter clauses. + Get all existing properties of type 'date' in the schema. 
""" - weaviate_filters = [] - weaviate_filter = {} - for key, values in filters.items(): - for value in values: - weaviate_filter = {"path": [key], "operator": "Equal", "valueString": value} - weaviate_filters.append(weaviate_filter) - if len(weaviate_filters) > 1: - filter_dict = {"operator": "Or", "operands": weaviate_filters} - return filter_dict - else: - return weaviate_filter + index = self._sanitize_index_name(index) or self.index + cur_properties = [] + for class_item in self.weaviate_client.schema.get()["classes"]: + if class_item["class"] == index: + cur_properties = [item["name"] for item in class_item["properties"] if item["dataType"][0] == "date"] - def _update_schema(self, new_prop: str, index: Optional[str] = None): + return cur_properties + + def _update_schema( + self, new_prop: str, property_value: Union[List, str, int, float, bool], index: Optional[str] = None + ): """ Updates the schema with a new property. """ index = self._sanitize_index_name(index) or self.index - property_dict = {"dataType": ["string"], "description": f"dynamic property {new_prop}", "name": new_prop} + data_type = self._get_weaviate_type_of_value(property_value) + + property_dict = {"dataType": [data_type], "description": f"dynamic property {new_prop}", "name": new_prop} self.weaviate_client.schema.property.create(index, property_dict) + @staticmethod + def _get_weaviate_type_of_value(value: Union[List, str, int, float, bool]) -> str: + """ + Infers corresponding Weaviate data type for a value. 
+ """ + data_type = "" + list_of_values = False + if isinstance(value, list): + list_of_values = True + value = value[0] + + if isinstance(value, str): + # If the value is parsable by datetime, it is a date + try: + convert_date_to_rfc3339(value) + data_type = "date" + # Otherwise, the value is a string + except ValueError: + data_type = "string" + elif isinstance(value, int): + data_type = "int" + elif isinstance(value, float): + data_type = "number" + elif isinstance(value, bool): + data_type = "boolean" + + if list_of_values: + data_type += "[]" + + return data_type + def _check_document(self, cur_props: List[str], doc: dict) -> List[str]: """ Find the properties in the document that don't exist in the existing schema. @@ -458,9 +492,6 @@ class WeaviateDocumentStore(BaseDocumentStore): if self.similarity == "cosine": self.normalize_embedding(vector) - # rename as weaviate doesn't like "_" in field names - _doc["contenttype"] = _doc.pop("content_type") - # Converting content to JSON-string as Weaviate doesn't allow other nested list for tables _doc["content"] = json.dumps(_doc["content"]) @@ -469,9 +500,14 @@ class WeaviateDocumentStore(BaseDocumentStore): missing_props = self._check_document(current_properties, _doc) if missing_props: for property in missing_props: - self._update_schema(property, index) + self._update_schema(property, _doc[property], index) current_properties.append(property) + # Weaviate requires dates to be in RFC3339 format + date_fields = self._get_date_properties(index) + for date_field in date_fields: + _doc[date_field] = convert_date_to_rfc3339(_doc[date_field]) + docs_batch.add(_doc, class_name=index, uuid=doc_id, vector=vector) # Ingest a batch of documents @@ -489,23 +525,43 @@ class WeaviateDocumentStore(BaseDocumentStore): progress_bar.update(batch_size) progress_bar.close() - def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None): + def update_document_meta(self, id: str, meta: Dict[str, Union[List, str, 
int, float, bool]], index: str = None): """ Update the metadata dictionary of a document by specifying its string id. + Overwrites only the specified fields, the unspecified ones remain unchanged. """ if not index: index = self.index + + current_properties = self._get_current_properties(index) + + # Check if the new metadata contains additional properties and append them to the schema + missing_props = self._check_document(current_properties, meta) + if missing_props: + for property in missing_props: + self._update_schema(property, meta[property], index) + current_properties.append(property) + + # Weaviate requires dates to be in RFC3339 format + date_fields = self._get_date_properties(index) + for date_field in date_fields: + if isinstance(meta[date_field], str): + meta[date_field] = convert_date_to_rfc3339(str(meta[date_field])) + self.weaviate_client.data_object.update(meta, class_name=index, uuid=id) - def get_embedding_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int: + def get_embedding_count( + self, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None + ) -> int: """ - Return the number of embeddings in the document store, which is the same as the number of documents since every document has a default embedding + Return the number of embeddings in the document store, which is the same as the number of documents since + every document has a default embedding. 
""" return self.get_document_count(filters=filters, index=index) def get_document_count( self, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None, @@ -522,7 +578,7 @@ class WeaviateDocumentStore(BaseDocumentStore): index = self._sanitize_index_name(index) or self.index doc_count = 0 if filters: - filter_dict = self._build_filter_clause(filters=filters) + filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() result = ( self.weaviate_client.query.aggregate(index).with_fields("meta { count }").with_where(filter_dict).do() ) @@ -538,7 +594,7 @@ class WeaviateDocumentStore(BaseDocumentStore): def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -548,8 +604,32 @@ class WeaviateDocumentStore(BaseDocumentStore): :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. 
Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` :param return_embedding: Whether to return the document embeddings. :param batch_size: When working with large number of documents, batching can help reduce memory footprint. """ @@ -566,7 +646,7 @@ class WeaviateDocumentStore(BaseDocumentStore): def _get_all_documents_in_index( self, index: Optional[str], - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 10_000, only_documents_without_embedding: bool = False, ) -> Generator[dict, None, None]: @@ -580,7 +660,7 @@ class WeaviateDocumentStore(BaseDocumentStore): properties.append("_additional {id, certainty, vector}") if filters: - filter_dict = self._build_filter_clause(filters=filters) + filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() result = ( self.weaviate_client.query.get(class_name=index, properties=properties).with_where(filter_dict).do() ) @@ -597,7 +677,7 @@ class WeaviateDocumentStore(BaseDocumentStore): def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None, @@ -609,8 +689,32 @@ class WeaviateDocumentStore(BaseDocumentStore): :param index: Name of the index to get the 
documents from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` :param return_embedding: Whether to return the document embeddings. :param batch_size: When working with large number of documents, batching can help reduce memory footprint. """ @@ -630,7 +734,7 @@ class WeaviateDocumentStore(BaseDocumentStore): def query( self, query: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, @@ -640,7 +744,69 @@ class WeaviateDocumentStore(BaseDocumentStore): that are most relevant to the query as defined by Weaviate semantic search. 
:param query: The query - :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` :param top_k: How many documents to return per query. 
:param custom_query: Custom query that will executed using query.raw method, for more details refer https://www.semi.technology/developers/weaviate/current/graphql-references/filters.html @@ -655,7 +821,7 @@ class WeaviateDocumentStore(BaseDocumentStore): if custom_query: query_output = self.weaviate_client.query.raw(custom_query) elif filters: - filter_dict = self._build_filter_clause(filters) + filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() query_output = ( self.weaviate_client.query.get(class_name=index, properties=properties) .with_where(filter_dict) @@ -684,7 +850,7 @@ class WeaviateDocumentStore(BaseDocumentStore): def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[dict] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -694,8 +860,69 @@ class WeaviateDocumentStore(BaseDocumentStore): Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. :param query_emb: Embedding of the query (e.g. gathered from DPR) - :param filters: Optional filters to narrow down the search space. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. 
If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` :param top_k: How many documents to return :param index: index name for storing the docs and metadata :param return_embedding: To return document embedding @@ -719,7 +946,7 @@ class WeaviateDocumentStore(BaseDocumentStore): query_string = {"vector": query_emb} if filters: - filter_dict = self._build_filter_clause(filters) + filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() query_output = ( self.weaviate_client.query.get(class_name=index, properties=properties) .with_where(filter_dict) @@ -751,7 +978,7 @@ class WeaviateDocumentStore(BaseDocumentStore): self, retriever, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, ): @@ -763,8 +990,32 @@ class WeaviateDocumentStore(BaseDocumentStore): :param index: Index name to update :param update_existing_embeddings: Weaviate mandates 
an embedding while creating the document itself. This option must be always true for weaviate and it will update the embeddings for all the documents. - :param filters: Optional filters to narrow down the documents for which embeddings are to be updated. - Example: {"name": ["some", "more"], "category": ["only_one"]} + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` :param batch_size: When working with large number of documents, batching can help reduce memory footprint. :return: None """ @@ -808,13 +1059,38 @@ class WeaviateDocumentStore(BaseDocumentStore): def delete_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): """ Delete documents in an index. All documents are deleted if no filters are passed. 
:param index: Index name to delete the document from. - :param filters: Optional filters to narrow down the documents to be deleted. + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` :return: None """ if headers: @@ -832,7 +1108,7 @@ class WeaviateDocumentStore(BaseDocumentStore): self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): """ @@ -841,11 +1117,35 @@ class WeaviateDocumentStore(BaseDocumentStore): :param index: Index name to delete the document from. If None, the DocumentStore's default index (self.index) will be used. :param ids: Optional list of IDs to narrow down the documents to be deleted. - :param filters: Optional filters to narrow down the documents to be deleted. 
- Example filters: {"name": ["some", "more"], "category": ["only_one"]}. - If filters are provided along with a list of IDs, this method deletes the - intersection of the two query results (documents that match the filters and - have their ID in the list). + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). 
:return: None """ if headers: diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index 05ececaaa..f715f48de 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -873,7 +873,7 @@ class FARMReader(BaseReader): ) # extract all questions for evaluation - filters = {"origin": [label_origin]} + filters: Dict = {"origin": [label_origin]} labels = document_store.get_all_labels(index=label_index, filters=filters) diff --git a/haystack/nodes/retriever/base.py b/haystack/nodes/retriever/base.py index 2a25fe4dd..1551a7a0c 100644 --- a/haystack/nodes/retriever/base.py +++ b/haystack/nodes/retriever/base.py @@ -124,7 +124,7 @@ class BaseRetriever(BaseComponent): """ # Extract all questions for evaluation - filters = {"origin": [label_origin]} + filters: Dict = {"origin": [label_origin]} timed_retrieve = self.timing(self.retrieve, "retrieve_time") diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index 6d875012e..4be170fa5 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Dict, Generator, List, Optional, Union try: from typing import Literal @@ -266,7 +266,7 @@ class IndexClient: def query( self, query: Optional[str] = None, - filters: Optional[Dict[str, List[str]]] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, query_emb: Optional[List[float]] = None, diff --git a/test/conftest.py b/test/conftest.py index 314413b80..f2292876b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -323,7 +323,7 @@ def test_docs_xs(): "meta_field": "test2", "name": "filename2", "date_field": "2019-10-01", - "numeric_field": 5, + "numeric_field": 5.0, }, # Document object for a doc Document( @@ -332,11 +332,11 @@ def test_docs_xs(): ), Document( content="My name is Camila 
and I live in Madrid", - meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3}, + meta={"meta_field": "test4", "name": "filename4", "date_field": "2021-02-01", "numeric_field": 3.0}, ), Document( content="My name is Matteo and I live in Rome", - meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0}, + meta={"meta_field": "test5", "name": "filename5", "date_field": "2019-01-01", "numeric_field": 0.0}, ), ] @@ -530,16 +530,6 @@ def document_store_with_docs(request, test_docs_xs, tmp_path): document_store = get_document_store( document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path ) - # TODO: remove the following part once we allow numbers as metadatfield value in WeaviateDocumentStore - if request.param == "weaviate": - for doc in test_docs_xs: - if isinstance(doc, Document): - doc.meta["numeric_field"] = str(doc.meta["numeric_field"]) - else: - if "meta" in doc: - doc["meta"]["numeric_field"] = str(doc["meta"]["numeric_field"]) - else: - doc["numeric_field"] = str(doc["numeric_field"]) document_store.write_documents(test_docs_xs) yield document_store document_store.delete_documents() diff --git a/test/test_document_store.py b/test/test_document_store.py index d76b23f19..1d817d745 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -216,7 +216,7 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs) assert len(documents) == 0 -@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "weaviate"], indirect=True) def test_extended_filter(document_store_with_docs): # Test comparison operators individually documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}}) @@ -235,16 +235,16 @@ def test_extended_filter(document_store_with_docs): documents = 
document_store_with_docs.get_all_documents(filters={"meta_field": {"$nin": ["test1", "test2", "n.a."]}}) assert len(documents) == 3 - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3}}) + documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gt": 3.0}}) assert len(documents) == 3 - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3}}) + documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$gte": 3.0}}) assert len(documents) == 4 - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3}}) + documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lt": 3.0}}) assert len(documents) == 1 - documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3}}) + documents = document_store_with_docs.get_all_documents(filters={"numeric_field": {"$lte": 3.0}}) assert len(documents) == 2 # Test compound filters @@ -265,29 +265,34 @@ def test_extended_filter(document_store_with_docs): "name": ["filename5", "filename3"], } documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified) - assert documents == documents_simplified_filter + # Order of returned documents might differ + assert len(documents) == len(documents_simplified_filter) and all( + doc in documents_simplified_filter for doc in documents + ) filters = { "$and": { "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5}}, + "$or": {"name": {"$in": ["filename5", "filename3"]}, "numeric_field": {"$lte": 5.0}}, } } documents = document_store_with_docs.get_all_documents(filters=filters) assert len(documents) == 2 filters_simplified = { "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, - "$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 
5}}, + "$or": {"name": ["filename5", "filename3"], "numeric_field": {"$lte": 5.0}}, } documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified) - assert documents == documents_simplified_filter + assert len(documents) == len(documents_simplified_filter) and all( + doc in documents_simplified_filter for doc in documents + ) filters = { "$and": { "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, "$or": { "name": {"$in": ["filename5", "filename3"]}, - "$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": {"$eq": "test2"}}}, + "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": {"$eq": "test2"}}}, }, } } @@ -297,11 +302,28 @@ def test_extended_filter(document_store_with_docs): "date_field": {"$lte": "2020-12-31", "$gte": "2019-01-01"}, "$or": { "name": ["filename5", "filename3"], - "$and": {"numeric_field": {"$lte": 5}, "$not": {"meta_field": "test2"}}, + "$and": {"numeric_field": {"$lte": 5.0}, "$not": {"meta_field": "test2"}}, }, } documents_simplified_filter = document_store_with_docs.get_all_documents(filters=filters_simplified) - assert documents == documents_simplified_filter + assert len(documents) == len(documents_simplified_filter) and all( + doc in documents_simplified_filter for doc in documents + ) + + # Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore + filters = { + "$not": { + "$or": { + "$and": {"numeric_field": {"$gt": 3.0}, "meta_field": {"$ne": "test3"}}, + "$not": {"date_field": {"$lt": "2020-01-01"}}, + } + } + } + documents = document_store_with_docs.get_all_documents(filters=filters) + docs_meta = [doc.meta["meta_field"] for doc in documents] + assert len(documents) == 2 + assert "test3" in docs_meta + assert "test5" in docs_meta # Test same logical operator twice on same level filters = {