mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 09:49:48 +00:00 
			
		
		
		
	refactor: Add support for new filters declaration (#6397)
* Rework filter logic for InMemoryDocumentStore to support new filters declaration
* Fix legacy filters tests
* Simplify logic and handle dates comparison
* Rework MetadataRouter to support new filters
* Update docstrings
* Add release notes
* Fix linting
* Avoid duplicating filters specifications
* Handle corner case
* Simplify docstring
* Fix filters logic and tests
* Fix Document Store testing legacy filters tests
This commit is contained in:
		
							parent
							
								
									28c2b09d90
								
							
						
					
					
						commit
						fd16ec63cb
					
				| @ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path): | ||||
|     preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter") | ||||
|     preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier") | ||||
|     preprocessing_pipeline.add_component( | ||||
|         instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router" | ||||
|         instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router" | ||||
|     ) | ||||
|     preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner") | ||||
|     preprocessing_pipeline.add_component( | ||||
|  | ||||
| @ -1,7 +1,7 @@ | ||||
| from typing import Dict, List | ||||
| 
 | ||||
| from haystack.preview import component, Document | ||||
| from haystack.preview.utils.filters import document_matches_filter | ||||
| from haystack.preview.utils.filters import document_matches_filter, convert | ||||
| 
 | ||||
| 
 | ||||
| @component | ||||
| @ -19,10 +19,34 @@ class MetadataRouter: | ||||
|                       follow the format of filtering expressions in Haystack. For example: | ||||
|                       ```python | ||||
|                       { | ||||
|                             "edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}}, | ||||
|                             "edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}}, | ||||
|                             "edge_3": {"created_at": {"$gte": "2023-07-01", "$lt": "2023-10-01"}}, | ||||
|                             "edge_4": {"created_at": {"$gte": "2023-10-01", "$lt": "2024-01-01"}}, | ||||
|                         "edge_1": { | ||||
|                             "operator": "AND", | ||||
|                             "conditions": [ | ||||
|                                 {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}, | ||||
|                                 {"field": "meta.created_at", "operator": "<", "value": "2023-04-01"}, | ||||
|                             ], | ||||
|                         }, | ||||
|                         "edge_2": { | ||||
|                             "operator": "AND", | ||||
|                             "conditions": [ | ||||
|                                 {"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"}, | ||||
|                                 {"field": "meta.created_at", "operator": "<", "value": "2023-07-01"}, | ||||
|                             ], | ||||
|                         }, | ||||
|                         "edge_3": { | ||||
|                             "operator": "AND", | ||||
|                             "conditions": [ | ||||
|                                 {"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"}, | ||||
|                                 {"field": "meta.created_at", "operator": "<", "value": "2023-10-01"}, | ||||
|                             ], | ||||
|                         }, | ||||
|                         "edge_4": { | ||||
|                             "operator": "AND", | ||||
|                             "conditions": [ | ||||
|                                 {"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"}, | ||||
|                                 {"field": "meta.created_at", "operator": "<", "value": "2024-01-01"}, | ||||
|                             ], | ||||
|                         }, | ||||
|                     } | ||||
|                     ``` | ||||
|         """ | ||||
| @ -43,6 +67,9 @@ class MetadataRouter: | ||||
|         for document in documents: | ||||
|             cur_document_matched = False | ||||
|             for edge, rule in self.rules.items(): | ||||
|                 if "operator" not in rule: | ||||
|                     # Must be a legacy filter, convert it | ||||
|                     rule = convert(rule) | ||||
|                 if document_matches_filter(rule, document): | ||||
|                     output[edge].append(document) | ||||
|                     cur_document_matched = True | ||||
|  | ||||
| @ -11,7 +11,7 @@ from haystack.preview import default_from_dict, default_to_dict | ||||
| from haystack.preview.document_stores.decorator import document_store | ||||
| from haystack.preview.dataclasses import Document | ||||
| from haystack.preview.document_stores.protocols import DuplicatePolicy | ||||
| from haystack.preview.utils.filters import document_matches_filter | ||||
| from haystack.preview.utils.filters import document_matches_filter, convert | ||||
| from haystack.preview.document_stores.errors import DuplicateDocumentError, DocumentStoreError | ||||
| from haystack.preview.utils import expit | ||||
| 
 | ||||
| @ -92,75 +92,15 @@ class InMemoryDocumentStore: | ||||
|         """ | ||||
|         Returns the documents that match the filters provided. | ||||
| 
 | ||||
|         Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, | ||||
|         `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, | ||||
|         `"$lte"`) or a metadata field name. | ||||
| 
 | ||||
|         Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata | ||||
|         field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or | ||||
|         (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default | ||||
|         operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used | ||||
|         as default operation. | ||||
| 
 | ||||
|         Example: | ||||
| 
 | ||||
|         ```python | ||||
|         filters = { | ||||
|             "$and": { | ||||
|                 "type": {"$eq": "article"}, | ||||
|                 "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, | ||||
|                 "rating": {"$gte": 3}, | ||||
|                 "$or": { | ||||
|                     "genre": {"$in": ["economy", "politics"]}, | ||||
|                     "publisher": {"$eq": "nytimes"} | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         # or simpler using default operators | ||||
|         filters = { | ||||
|             "type": "article", | ||||
|             "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, | ||||
|             "rating": {"$gte": 3}, | ||||
|             "$or": { | ||||
|                 "genre": ["economy", "politics"], | ||||
|                 "publisher": "nytimes" | ||||
|             } | ||||
|         } | ||||
|         ``` | ||||
| 
 | ||||
|         To use the same logical operator multiple times on the same level, logical operators can take a list of | ||||
|         dictionaries as value. | ||||
| 
 | ||||
|         Example: | ||||
| 
 | ||||
|         ```python | ||||
|         filters = { | ||||
|             "$or": [ | ||||
|                 { | ||||
|                     "$and": { | ||||
|                         "Type": "News Paper", | ||||
|                         "Date": { | ||||
|                             "$lt": "2019-01-01" | ||||
|                         } | ||||
|                     } | ||||
|                 }, | ||||
|                 { | ||||
|                     "$and": { | ||||
|                         "Type": "Blog Post", | ||||
|                         "Date": { | ||||
|                             "$gte": "2019-01-01" | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             ] | ||||
|         } | ||||
|         ``` | ||||
|         For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation. | ||||
| 
 | ||||
|         :param filters: The filters to apply to the document list. | ||||
|         :return: A list of Documents that match the given filters. | ||||
|         """ | ||||
|         if filters: | ||||
|             return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)] | ||||
|             if "operator" not in filters: | ||||
|                 filters = convert(filters) | ||||
|             return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)] | ||||
|         return list(self.storage.values()) | ||||
| 
 | ||||
|     def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int: | ||||
| @ -220,9 +160,17 @@ class InMemoryDocumentStore: | ||||
|         if not query: | ||||
|             raise ValueError("Query should be a non-empty string") | ||||
| 
 | ||||
|         content_type_filter = {"$or": {"content": {"$not": None}, "dataframe": {"$not": None}}} | ||||
|         content_type_filter = { | ||||
|             "operator": "OR", | ||||
|             "conditions": [ | ||||
|                 {"field": "content", "operator": "!=", "value": None}, | ||||
|                 {"field": "dataframe", "operator": "!=", "value": None}, | ||||
|             ], | ||||
|         } | ||||
|         if filters: | ||||
|             filters = {"$and": [content_type_filter, filters]} | ||||
|             if "operator" not in filters: | ||||
|                 filters = convert(filters) | ||||
|             filters = {"operator": "AND", "conditions": [content_type_filter, filters]} | ||||
|         else: | ||||
|             filters = content_type_filter | ||||
|         all_documents = self.filter_documents(filters=filters) | ||||
|  | ||||
| @ -51,69 +51,64 @@ class DocumentStore(Protocol): | ||||
|         """ | ||||
|         Returns the documents that match the filters provided. | ||||
| 
 | ||||
|         Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, | ||||
|         `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, | ||||
|         `"$lte"`) or a metadata field name. | ||||
|         Filters are defined as nested dictionaries that can be of two types: | ||||
|         - Comparison | ||||
|         - Logic | ||||
| 
 | ||||
|         Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata | ||||
|         field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or | ||||
|         (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default | ||||
|         operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used | ||||
|         as default operation. | ||||
|         Comparison dictionaries must contain the keys: | ||||
| 
 | ||||
|         Example: | ||||
|         - `field` | ||||
|         - `operator` | ||||
|         - `value` | ||||
| 
 | ||||
|         Logic dictionaries must contain the keys: | ||||
| 
 | ||||
|         - `operator` | ||||
|         - `conditions` | ||||
| 
 | ||||
|         The `conditions` key must be a list of dictionaries, either of type Comparison or Logic. | ||||
| 
 | ||||
|         The `operator` value in Comparison dictionaries must be one of: | ||||
| 
 | ||||
|         - `==` | ||||
|         - `!=` | ||||
|         - `>` | ||||
|         - `>=` | ||||
|         - `<` | ||||
|         - `<=` | ||||
|         - `in` | ||||
|         - `not in` | ||||
| 
 | ||||
|         The `operator` values in Logic dictionaries must be one of: | ||||
| 
 | ||||
|         - `NOT` | ||||
|         - `OR` | ||||
|         - `AND` | ||||
| 
 | ||||
| 
 | ||||
|         A simple filter: | ||||
|         ```python | ||||
|         filters = { | ||||
|             "$and": { | ||||
|                 "type": {"$eq": "article"}, | ||||
|                 "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, | ||||
|                 "rating": {"$gte": 3}, | ||||
|                 "$or": { | ||||
|                     "genre": {"$in": ["economy", "politics"]}, | ||||
|                     "publisher": {"$eq": "nytimes"} | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         # or simpler using default operators | ||||
|         filters = { | ||||
|             "type": "article", | ||||
|             "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, | ||||
|             "rating": {"$gte": 3}, | ||||
|             "$or": { | ||||
|                 "genre": ["economy", "politics"], | ||||
|                 "publisher": "nytimes" | ||||
|             } | ||||
|         } | ||||
|         filters = {"field": "meta.type", "operator": "==", "value": "article"} | ||||
|         ``` | ||||
| 
 | ||||
|         To use the same logical operator multiple times on the same level, logical operators can take a list of | ||||
|         dictionaries as value. | ||||
| 
 | ||||
|         Example: | ||||
| 
 | ||||
|         A more complex filter: | ||||
|         ```python | ||||
|         filters = { | ||||
|             "$or": [ | ||||
|             "operator": "AND", | ||||
|             "conditions": [ | ||||
|                 {"field": "meta.type", "operator": "==", "value": "article"}, | ||||
|                 {"field": "meta.date", "operator": ">=", "value": 1420066800}, | ||||
|                 {"field": "meta.date", "operator": "<", "value": 1609455600}, | ||||
|                 {"field": "meta.rating", "operator": ">=", "value": 3}, | ||||
|                 { | ||||
|                     "$and": { | ||||
|                         "Type": "News Paper", | ||||
|                         "Date": { | ||||
|                             "$lt": "2019-01-01" | ||||
|                         } | ||||
|                     } | ||||
|                     "operator": "OR", | ||||
|                     "conditions": [ | ||||
|                         {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, | ||||
|                         {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, | ||||
|                     ], | ||||
|                 }, | ||||
|                 { | ||||
|                     "$and": { | ||||
|                         "Type": "Blog Post", | ||||
|                         "Date": { | ||||
|                             "$gte": "2019-01-01" | ||||
|             ], | ||||
|         } | ||||
|                     } | ||||
|                 } | ||||
|             ] | ||||
|         } | ||||
|         ``` | ||||
| 
 | ||||
|         :param filters: the filters to apply to the document list. | ||||
|         :return: a list of Documents that match the given filters. | ||||
|  | ||||
| @ -236,7 +236,7 @@ class LegacyFilterDocumentsInvalidFiltersTest(FilterableDocsFixtureMixin): | ||||
|     @pytest.mark.unit | ||||
|     def test_incorrect_filter_type(self, document_store: DocumentStore, filterable_docs: List[Document]): | ||||
|         document_store.write_documents(filterable_docs) | ||||
|         with pytest.raises(FilterError): | ||||
|         with pytest.raises(ValueError): | ||||
|             document_store.filter_documents(filters="something odd")  # type: ignore | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
| @ -574,7 +574,9 @@ class LegacyFilterDocumentsLessThanTest(FilterableDocsFixtureMixin): | ||||
|     def test_lt_filter(self, document_store: DocumentStore, filterable_docs: List[Document]): | ||||
|         document_store.write_documents(filterable_docs) | ||||
|         result = document_store.filter_documents(filters={"number": {"$lt": 0.0}}) | ||||
|         assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] < 0] | ||||
|         assert result == [ | ||||
|             doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] < 0 | ||||
|         ] | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test_lt_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]): | ||||
| @ -614,7 +616,9 @@ class LegacyFilterDocumentsLessThanEqualTest(FilterableDocsFixtureMixin): | ||||
|     def test_lte_filter(self, document_store: DocumentStore, filterable_docs: List[Document]): | ||||
|         document_store.write_documents(filterable_docs) | ||||
|         result = document_store.filter_documents(filters={"number": {"$lte": 2.0}}) | ||||
|         assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] <= 2.0] | ||||
|         assert result == [ | ||||
|             doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] <= 2.0 | ||||
|         ] | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test_lte_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]): | ||||
| @ -658,7 +662,8 @@ class LegacyFilterDocumentsSimpleLogicalTest(FilterableDocsFixtureMixin): | ||||
|         assert result == [ | ||||
|             doc | ||||
|             for doc in filterable_docs | ||||
|             if (("number" in doc.meta and doc.meta["number"] < 1) or doc.meta.get("name") in ["name_0", "name_1"]) | ||||
|             if (doc.meta.get("number") is not None and doc.meta["number"] < 1) | ||||
|             or doc.meta.get("name") in ["name_0", "name_1"] | ||||
|         ] | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
| @ -733,7 +738,10 @@ class LegacyFilterDocumentsNestedLogicalTest(FilterableDocsFixtureMixin): | ||||
|         assert result == [ | ||||
|             doc | ||||
|             for doc in filterable_docs | ||||
|             if (doc.meta.get("name") in ["name_0", "name_1"] or ("number" in doc.meta and doc.meta["number"] < 1)) | ||||
|             if ( | ||||
|                 doc.meta.get("name") in ["name_0", "name_1"] | ||||
|                 or (doc.meta.get("number") is not None and doc.meta["number"] < 1) | ||||
|             ) | ||||
|         ] | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
| @ -783,11 +791,8 @@ class LegacyFilterDocumentsNestedLogicalTest(FilterableDocsFixtureMixin): | ||||
|             doc | ||||
|             for doc in filterable_docs | ||||
|             if ( | ||||
|                 ("number" in doc.meta and doc.meta["number"] < 1) | ||||
|                 or ( | ||||
|                     doc.meta.get("name") in ["name_0", "name_1"] | ||||
|                     and ("chapter" in doc.meta and doc.meta["chapter"] != "intro") | ||||
|                 ) | ||||
|                 (doc.meta.get("number") is not None and doc.meta["number"] < 1) | ||||
|                 or (doc.meta.get("name") in ["name_0", "name_1"] and (doc.meta.get("chapter") != "intro")) | ||||
|             ) | ||||
|         ] | ||||
| 
 | ||||
|  | ||||
| @ -1,297 +1,174 @@ | ||||
| from typing import List, Any, Union, Dict | ||||
| from dataclasses import fields | ||||
| from datetime import datetime | ||||
| 
 | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| 
 | ||||
| from haystack.preview.dataclasses import Document | ||||
| from haystack.preview.errors import FilterError | ||||
| 
 | ||||
| 
 | ||||
| GT_TYPES = (int, float, np.number) | ||||
| IN_TYPES = (list, set, tuple) | ||||
| 
 | ||||
| 
 | ||||
| def not_operation(conditions: List[Any], document: Document, _current_key: str): | ||||
| def document_matches_filter(filters: Dict[str, Any], document: Document) -> bool: | ||||
|     """ | ||||
|     Applies a NOT to all the nested conditions. | ||||
| 
 | ||||
|     :param conditions: the filters dictionary. | ||||
|     :param document: the document to test. | ||||
|     :param _current_key: internal, don't use. | ||||
|     :return: True if the document matches the negated filters, False otherwise | ||||
|     Return whether `filters` match the Document. | ||||
|     For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation. | ||||
|     """ | ||||
|     return not and_operation(conditions=conditions, document=document, _current_key=_current_key) | ||||
|     if "field" in filters: | ||||
|         return _comparison_condition(filters, document) | ||||
|     return _logic_condition(filters, document) | ||||
| 
 | ||||
| 
 | ||||
| def and_operation(conditions: List[Any], document: Document, _current_key: str): | ||||
|     """ | ||||
|     Applies an AND to all the nested conditions. | ||||
| 
 | ||||
|     :param conditions: the filters dictionary. | ||||
|     :param document: the document to test. | ||||
|     :param _current_key: internal, don't use. | ||||
|     :return: True if the document matches all the filters, False otherwise | ||||
|     """ | ||||
|     return all( | ||||
|         document_matches_filter(conditions=condition, document=document, _current_key=_current_key) | ||||
|         for condition in conditions | ||||
|     ) | ||||
| def _and(document: Document, conditions: List[Dict[str, Any]]) -> bool: | ||||
|     return all(_comparison_condition(condition, document) for condition in conditions) | ||||
| 
 | ||||
| 
 | ||||
| def or_operation(conditions: List[Any], document: Document, _current_key: str): | ||||
|     """ | ||||
|     Applies an OR to all the nested conditions. | ||||
| 
 | ||||
|     :param conditions: the filters dictionary. | ||||
|     :param document: the document to test. | ||||
|     :param _current_key: internal, don't use. | ||||
|     :return: True if the document matches any of the filters, False otherwise | ||||
|     """ | ||||
|     return any( | ||||
|         document_matches_filter(conditions=condition, document=document, _current_key=_current_key) | ||||
|         for condition in conditions | ||||
|     ) | ||||
| def _or(document: Document, conditions: List[Dict[str, Any]]) -> bool: | ||||
|     return any(_comparison_condition(condition, document) for condition in conditions) | ||||
| 
 | ||||
| 
 | ||||
| def _safe_eq(first: Any, second: Any) -> bool: | ||||
|     """ | ||||
|     Compares objects for equality, even np.ndarrays and pandas DataFrames. | ||||
|     """ | ||||
| 
 | ||||
|     if isinstance(first, pd.DataFrame): | ||||
|         first = first.to_json() | ||||
| 
 | ||||
|     if isinstance(second, pd.DataFrame): | ||||
|         second = second.to_json() | ||||
| 
 | ||||
|     if isinstance(first, np.ndarray): | ||||
|         first = first.tolist() | ||||
| 
 | ||||
|     if isinstance(second, np.ndarray): | ||||
|         second = second.tolist() | ||||
| 
 | ||||
|     return first == second | ||||
| def _not(document: Document, conditions: List[Dict[str, Any]]) -> bool: | ||||
|     return not _and(document, conditions) | ||||
| 
 | ||||
| 
 | ||||
| def _safe_gt(first: Any, second: Any) -> bool: | ||||
|     """ | ||||
|     Checks if first is bigger than second. | ||||
| LOGICAL_OPERATORS = {"NOT": _not, "OR": _or, "AND": _and} | ||||
| 
 | ||||
|     Works only for numerical values and dates in ISO format (YYYY-MM-DD). Strings, lists, tables and tensors all raise exceptions. | ||||
|     """ | ||||
|     if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES): | ||||
| 
 | ||||
| def _equal(document_value: Any, filter_value: Any) -> bool: | ||||
|     if isinstance(document_value, pd.DataFrame): | ||||
|         document_value = document_value.to_json() | ||||
| 
 | ||||
|     if isinstance(filter_value, pd.DataFrame): | ||||
|         filter_value = filter_value.to_json() | ||||
| 
 | ||||
|     return document_value == filter_value | ||||
| 
 | ||||
| 
 | ||||
| def _not_equal(document_value: Any, filter_value: Any) -> bool: | ||||
|     return not _equal(document_value=document_value, filter_value=filter_value) | ||||
| 
 | ||||
| 
 | ||||
| def _greater_than(document_value: Any, filter_value: Any) -> bool: | ||||
|     if document_value is None or filter_value is None: | ||||
|         # We can't compare None values reliably using operators '>', '>=', '<', '<=' | ||||
|         return False | ||||
| 
 | ||||
|     if isinstance(document_value, str) or isinstance(filter_value, str): | ||||
|         try: | ||||
|             first = datetime.fromisoformat(first) | ||||
|             second = datetime.fromisoformat(second) | ||||
|         except (ValueError, TypeError): | ||||
|             raise FilterError( | ||||
|                 f"Can't evaluate '{type(first).__name__} > {type(second).__name__}'. " | ||||
|                 f"Convert these values into one of the following types: {[type_.__name__ for type_ in GT_TYPES]} " | ||||
|                 f"or a datetime string in ISO 8601 format." | ||||
|             document_value = datetime.fromisoformat(document_value) | ||||
|             filter_value = datetime.fromisoformat(filter_value) | ||||
|         except (ValueError, TypeError) as exc: | ||||
|             msg = ( | ||||
|                 "Can't compare strings using operators '>', '>=', '<', '<='. " | ||||
|                 "Strings are only comparable if they are ISO formatted dates." | ||||
|             ) | ||||
|     return bool(first > second) | ||||
|             raise FilterError(msg) from exc | ||||
|     if type(filter_value) in [list, pd.DataFrame]: | ||||
|         msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='" | ||||
|         raise FilterError(msg) | ||||
|     return document_value > filter_value | ||||
| 
 | ||||
| 
 | ||||
| def eq_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks for equality between the document's field value value and a fixed value. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value: the fixed value to compare against | ||||
|     :return: True if the values are equal, False otherwise | ||||
|     """ | ||||
|     if not field_name in fields: | ||||
| def _greater_than_equal(document_value: Any, filter_value: Any) -> bool: | ||||
|     if document_value is None or filter_value is None: | ||||
|         # We can't compare None values reliably using operators '>', '>=', '<', '<=' | ||||
|         return False | ||||
| 
 | ||||
|     return _safe_eq(fields[field_name], value) | ||||
|     return _equal(document_value=document_value, filter_value=filter_value) or _greater_than( | ||||
|         document_value=document_value, filter_value=filter_value | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def in_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks for whether the document's field value value is present into the given list. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value; the fixed value to compare against | ||||
|     :return: True if the document's value is included in the given list, False otherwise | ||||
|     """ | ||||
|     if not field_name in fields: | ||||
| def _less_than(document_value: Any, filter_value: Any) -> bool: | ||||
|     if document_value is None or filter_value is None: | ||||
|         # We can't compare None values reliably using operators '>', '>=', '<', '<=' | ||||
|         return False | ||||
| 
 | ||||
|     if not isinstance(value, IN_TYPES): | ||||
|         raise FilterError("$in accepts only iterable values like lists, sets and tuples.") | ||||
| 
 | ||||
|     return any(_safe_eq(fields[field_name], v) for v in value) | ||||
|     return not _greater_than_equal(document_value=document_value, filter_value=filter_value) | ||||
| 
 | ||||
| 
 | ||||
| def ne_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks for inequality between the document's field value value and a fixed value. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value; the fixed value to compare against | ||||
|     :return: True if the values are different, False otherwise | ||||
|     """ | ||||
|     return not eq_operation(fields, field_name, value) | ||||
| 
 | ||||
| 
 | ||||
| def nin_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks whether the document's field value value is absent from the given list. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value; the fixed value to compare against | ||||
|     :return: True if the document's value is not included in the given list, False otherwise | ||||
|     """ | ||||
|     return not in_operation(fields, field_name, value) | ||||
| 
 | ||||
| 
 | ||||
| def gt_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks whether the document's field value value is (strictly) larger than the given value. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value; the fixed value to compare against | ||||
|     :return: True if the document's value is strictly larger than the fixed value, False otherwise | ||||
|     """ | ||||
|     if not field_name in fields: | ||||
| def _less_than_equal(document_value: Any, filter_value: Any) -> bool: | ||||
|     if document_value is None or filter_value is None: | ||||
|         # We can't compare None values reliably using operators '>', '>=', '<', '<=' | ||||
|         return False | ||||
|     return _safe_gt(fields[field_name], value) | ||||
| 
 | ||||
|     return not _greater_than(document_value=document_value, filter_value=filter_value) | ||||
| 
 | ||||
| 
 | ||||
| def gte_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks whether the document's field value value is larger than or equal to the given value. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value; the fixed value to compare against | ||||
|     :return: True if the document's value is larger than or equal to the fixed value, False otherwise | ||||
|     """ | ||||
|     return gt_operation(fields, field_name, value) or eq_operation(fields, field_name, value) | ||||
| def _in(document_value: Any, filter_value: Any) -> bool: | ||||
|     if not isinstance(filter_value, list): | ||||
|         msg = ( | ||||
|             f"Filter value must be a `list` when using operator 'in' or 'not in', received type '{type(filter_value)}'" | ||||
|         ) | ||||
|         raise FilterError(msg) | ||||
|     return any(_equal(e, document_value) for e in filter_value) | ||||
| 
 | ||||
| 
 | ||||
| def lt_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks whether the document's field value value is (strictly) smaller than the given value. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value; the fixed value to compare against | ||||
|     :return: True if the document's value is strictly smaller than the fixed value, False otherwise | ||||
|     """ | ||||
|     if not field_name in fields: | ||||
|         return False | ||||
|     return not _safe_gt(fields[field_name], value) and not _safe_eq(fields[field_name], value) | ||||
| def _not_in(document_value: Any, filter_value: Any) -> bool: | ||||
|     return not _in(document_value=document_value, filter_value=filter_value) | ||||
| 
 | ||||
| 
 | ||||
| def lte_operation(fields, field_name, value): | ||||
|     """ | ||||
|     Checks whether the document's field value value is smaller than or equal to the given value. | ||||
| 
 | ||||
|     :param fields: all the document's field value | ||||
|     :param field_name: the field to test | ||||
|     :param value; the fixed value to compare against | ||||
|     :return: True if the document's value is smaller than or equal to the fixed value, False otherwise | ||||
|     """ | ||||
|     if not field_name in fields: | ||||
|         return False | ||||
|     return not _safe_gt(fields[field_name], value) | ||||
| 
 | ||||
| 
 | ||||
| LOGICAL_STATEMENTS = {"$not": not_operation, "$and": and_operation, "$or": or_operation} | ||||
| OPERATORS = { | ||||
|     "$eq": eq_operation, | ||||
|     "$in": in_operation, | ||||
|     "$ne": ne_operation, | ||||
|     "$nin": nin_operation, | ||||
|     "$gt": gt_operation, | ||||
|     "$gte": gte_operation, | ||||
|     "$lt": lt_operation, | ||||
|     "$lte": lte_operation, | ||||
| COMPARISON_OPERATORS = { | ||||
|     "==": _equal, | ||||
|     "!=": _not_equal, | ||||
|     ">": _greater_than, | ||||
|     ">=": _greater_than_equal, | ||||
|     "<": _less_than, | ||||
|     "<=": _less_than_equal, | ||||
|     "in": _in, | ||||
|     "not in": _not_in, | ||||
| } | ||||
| RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()] | ||||
| 
 | ||||
| 
 | ||||
| def document_matches_filter(conditions: Union[Dict, List], document: Document, _current_key=None): | ||||
|     """ | ||||
|     Check if a document's metadata matches the provided filter conditions. | ||||
| def _logic_condition(condition: Dict[str, Any], document: Document) -> bool: | ||||
|     if "operator" not in condition: | ||||
|         msg = f"'operator' key missing in {condition}" | ||||
|         raise FilterError(msg) | ||||
|     if "conditions" not in condition: | ||||
|         msg = f"'conditions' key missing in {condition}" | ||||
|         raise FilterError(msg) | ||||
|     operator: str = condition["operator"] | ||||
|     conditions: List[Dict[str, Any]] = condition["conditions"] | ||||
|     return LOGICAL_OPERATORS[operator](document, conditions) | ||||
| 
 | ||||
|     This function evaluates the specified conditions against the metadata of the given document | ||||
|     and returns True if the conditions are met, otherwise it returns False. | ||||
| 
 | ||||
|     :param conditions: A dictionary or list containing filter conditions to be applied to the document's metadata. | ||||
|     :param document: The document whose metadata will be evaluated against the conditions. | ||||
|     :param _current_key: internal parameter, don't use. | ||||
|     :return: True if the document's metadata matches the filter conditions, False otherwise. | ||||
|     """ | ||||
|     if isinstance(conditions, dict): | ||||
|         # Check for malformed filters, like {"name": {"year": "2020"}} | ||||
|         if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()): | ||||
|             raise FilterError( | ||||
|                 f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. " | ||||
|                 "Comparisons between dictionaries are not currently supported. " | ||||
|                 "Check the documentation to learn more about filters syntax." | ||||
|             ) | ||||
| def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool: | ||||
|     if "field" not in condition: | ||||
|         # 'field' key is only found in comparison dictionaries. | ||||
|         # We assume this is a logic dictionary since it's not present. | ||||
|         return _logic_condition(condition, document) | ||||
|     field: str = condition["field"] | ||||
| 
 | ||||
|         if len(conditions.keys()) > 1: | ||||
|             # The default operation for a list of sibling conditions is $and | ||||
|             return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key) | ||||
|     if "operator" not in condition: | ||||
|         msg = f"'operator' key missing in {condition}" | ||||
|         raise FilterError(msg) | ||||
|     if "value" not in condition: | ||||
|         msg = f"'value' key missing in {condition}" | ||||
|         raise FilterError(msg) | ||||
| 
 | ||||
|         field_key, field_value = list(conditions.items())[0] | ||||
| 
 | ||||
|         # Nested logical statement ($and, $or, $not) | ||||
|         if field_key in LOGICAL_STATEMENTS.keys(): | ||||
|             return LOGICAL_STATEMENTS[field_key]( | ||||
|                 conditions=_list_conditions(field_value), document=document, _current_key=_current_key | ||||
|             ) | ||||
| 
 | ||||
|         # A comparison operator ($eq, $in, $gte, ...) | ||||
|         if field_key in OPERATORS.keys(): | ||||
|             if not _current_key: | ||||
|                 raise FilterError( | ||||
|                     "Filters can't start with an operator like $eq and $in. You have to specify the field name first. " | ||||
|                     "See the examples in the documentation." | ||||
|                 ) | ||||
|             return OPERATORS[field_key](fields=document.to_dict(), field_name=_current_key, value=field_value) | ||||
| 
 | ||||
|         # Otherwise fall back to the defaults | ||||
|         conditions = _list_conditions(field_value) | ||||
|         _current_key = field_key | ||||
| 
 | ||||
|     # Defaults for implicit filters | ||||
|     if isinstance(conditions, list): | ||||
|         if all(isinstance(cond, dict) for cond in conditions): | ||||
|             # The default operation for a list of sibling conditions is $and | ||||
|             return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key) | ||||
|     if "." in field: | ||||
|         # Handles fields formatted like so: | ||||
|         # 'meta.person.name' | ||||
|         parts = field.split(".") | ||||
|         document_value = getattr(document, parts[0]) | ||||
|         for part in parts[1:]: | ||||
|             if part not in document_value: | ||||
|                 # If a field is not found we treat it as None | ||||
|                 document_value = None | ||||
|                 break | ||||
|             document_value = document_value[part] | ||||
|     elif field not in [f.name for f in fields(document)]: | ||||
|         # Converted legacy filters don't add the `meta.` prefix, so we assume | ||||
|         # that all filter fields that are not actual fields in Document are converted | ||||
|         # filters. | ||||
|         # | ||||
|         # We handle this to avoid breaking compatibility with converted legacy filters. | ||||
|         # This will be removed as soon as we stop supporting legacy filters. | ||||
|         document_value = document.meta.get(field) | ||||
|     else: | ||||
|             # The default operator for a {key: [value1, value2]} filter is $in | ||||
|             return in_operation(fields=document.to_dict(), field_name=_current_key, value=conditions) | ||||
| 
 | ||||
|     if _current_key: | ||||
|         # The default operator for a {key: value} filter is $eq | ||||
|         return eq_operation(fields=document.to_dict(), field_name=_current_key, value=conditions) | ||||
| 
 | ||||
|     raise FilterError("Filters must be dictionaries or lists. See the examples in the documentation.") | ||||
| 
 | ||||
| 
 | ||||
| def _list_conditions(conditions: Any) -> List[Any]: | ||||
|     """ | ||||
|     Make sure all nested conditions are not dictionaries or single values, but always lists. | ||||
| 
 | ||||
|     :param conditions: the conditions to transform into a list | ||||
|     :returns: a list of filters | ||||
|     """ | ||||
|     if isinstance(conditions, list): | ||||
|         return conditions | ||||
|     if isinstance(conditions, dict): | ||||
|         return [{key: value} for key, value in conditions.items()] | ||||
|     return [conditions] | ||||
|         document_value = getattr(document, field) | ||||
|     operator: str = condition["operator"] | ||||
|     filter_value: Any = condition["value"] | ||||
|     return COMPARISON_OPERATORS[operator](filter_value=filter_value, document_value=document_value) | ||||
| 
 | ||||
| 
 | ||||
| def convert(filters: Dict[str, Any]) -> Dict[str, Any]: | ||||
|  | ||||
							
								
								
									
										87
									
								
								releasenotes/notes/rework-filters-1bb103d196a1912b.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								releasenotes/notes/rework-filters-1bb103d196a1912b.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,87 @@ | ||||
| --- | ||||
| prelude: > | ||||
|   With proposal [#6001](https://github.com/deepset-ai/haystack/pull/6001) we introduced a better specification to declare filters in Haystack 2.x. | ||||
|   The new syntax is a bit more verbose but less confusing and ambiguous as there are no implicit operators. | ||||
|   This will simplify conversion from this common syntax to a Document Store specific filtering logic, so it will ease | ||||
|   development of new Document Stores. | ||||
|   Since everything must be declared explicitly, it will also make it easier for users to understand the filters just | ||||
|   by reading them. | ||||
| 
 | ||||
|   The full specification is as follows. | ||||
| 
 | ||||
|   --- | ||||
| 
 | ||||
|   The top level of a filter must be a dictionary. | ||||
| 
 | ||||
|   There are two types of dictionaries: | ||||
| 
 | ||||
|   - Comparison | ||||
|   - Logic | ||||
| 
 | ||||
|   The top level can be either a Comparison or a Logic dictionary. | ||||
| 
 | ||||
|   Comparison dictionaries must contain the keys: | ||||
| 
 | ||||
|   - `field` | ||||
|   - `operator` | ||||
|   - `value` | ||||
| 
 | ||||
|   Logic dictionaries must contain the keys: | ||||
| 
 | ||||
|   - `operator` | ||||
|   - `conditions` | ||||
| 
 | ||||
|   The value of the `conditions` key must be a list of dictionaries, either Comparison or Logic. | ||||
| 
 | ||||
|   `operator` values in Comparison dictionaries must be: | ||||
| 
 | ||||
|   - `==` | ||||
|   - `!=` | ||||
|   - `>` | ||||
|   - `>=` | ||||
|   - `<` | ||||
|   - `<=` | ||||
|   - `in` | ||||
|   - `not in` | ||||
| 
 | ||||
|   `operator` values in Logic dictionaries must be: | ||||
| 
 | ||||
|   - `NOT` | ||||
|   - `OR` | ||||
|   - `AND` | ||||
| 
 | ||||
|   --- | ||||
| 
 | ||||
|   A simple filter: | ||||
| 
 | ||||
|   ```python | ||||
|   filters = {"field": "meta.type", "operator": "==", "value": "article"} | ||||
|   ``` | ||||
| 
 | ||||
|   A more complex filter: | ||||
|   ```python | ||||
|   filters = { | ||||
|       "operator": "AND", | ||||
|       "conditions": [ | ||||
|           {"field": "meta.type", "operator": "==", "value": "article"}, | ||||
|           {"field": "meta.date", "operator": ">=", "value": 1420066800}, | ||||
|           {"field": "meta.date", "operator": "<", "value": 1609455600}, | ||||
|           {"field": "meta.rating", "operator": ">=", "value": 3}, | ||||
|           { | ||||
|               "operator": "OR", | ||||
|               "conditions": [ | ||||
|                   {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, | ||||
|                   {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, | ||||
|               ], | ||||
|           }, | ||||
|       ], | ||||
|   } | ||||
|   ``` | ||||
| 
 | ||||
|   --- | ||||
| 
 | ||||
|   To avoid causing too much disruption for users using legacy filters we'll keep supporting them for the time being. | ||||
|   We also provide a utility `convert` function for developers implementing their Document Store to do the same. | ||||
| preview: | ||||
|   - | | ||||
|     Refactored `InMemoryDocumentStore` and `MetadataRouter` filtering logic to support the new filters declaration. | ||||
| @ -8,8 +8,20 @@ class TestMetadataRouter: | ||||
|     @pytest.mark.unit | ||||
|     def test_run(self): | ||||
|         rules = { | ||||
|             "edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}}, | ||||
|             "edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}}, | ||||
|             "edge_1": { | ||||
|                 "operator": "AND", | ||||
|                 "conditions": [ | ||||
|                     {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}, | ||||
|                     {"field": "meta.created_at", "operator": "<", "value": "2023-04-01"}, | ||||
|                 ], | ||||
|             }, | ||||
|             "edge_2": { | ||||
|                 "operator": "AND", | ||||
|                 "conditions": [ | ||||
|                     {"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"}, | ||||
|                     {"field": "meta.created_at", "operator": "<", "value": "2023-07-01"}, | ||||
|                 ], | ||||
|             }, | ||||
|         } | ||||
|         router = MetadataRouter(rules=rules) | ||||
|         documents = [ | ||||
|  | ||||
| @ -146,10 +146,6 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests):  # pylint: disable=R0904 | ||||
|         results = document_store.bm25_retrieval(query="Python", top_k=1) | ||||
|         assert results[0].content == "Python is a popular programming language" | ||||
| 
 | ||||
|     @pytest.mark.skip(reason="Filter is not working properly, see https://github.com/deepset-ai/haystack/issues/6153") | ||||
|     def test_eq_filter_embedding(self, document_store: InMemoryDocumentStore, filterable_docs): | ||||
|         pass | ||||
| 
 | ||||
|     # Test a query, add a new document and make sure results are appropriately updated | ||||
|     @pytest.mark.unit | ||||
|     def test_bm25_retrieval_with_updated_docs(self, document_store: InMemoryDocumentStore): | ||||
|  | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Silvano Cerza
						Silvano Cerza