def convert(filters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a filter declared using the legacy style into the new style.
    This is mostly meant to ease migration from Haystack 1.x to 2.x for developers
    of Document Stores and Components that use filters.

    This function doesn't verify if `filters` are declared using the legacy style.

    Example usage:
    ```python
    legacy_filter = {
        "$and": {
            "type": {"$eq": "article"},
            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
            "rating": {"$gte": 3},
            "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
        }
    }
    assert convert(legacy_filter) == {
        "operator": "AND",
        "conditions": [
            {"field": "type", "operator": "==", "value": "article"},
            {"field": "date", "operator": ">=", "value": "2015-01-01"},
            {"field": "date", "operator": "<", "value": "2021-01-01"},
            {"field": "rating", "operator": ">=", "value": 3},
            {
                "operator": "OR",
                "conditions": [
                    {"field": "genre", "operator": "in", "value": ["economy", "politics"]},
                    {"field": "publisher", "operator": "==", "value": "nytimes"},
                ],
            },
        ],
    }
    ```

    Logical operators nested under a field keep that field for all their operands:
    ```python
    assert convert({"age": {"$not": {"$gt": 18}}}) == {
        "operator": "NOT",
        "conditions": [{"field": "age", "operator": ">", "value": 18}],
    }
    ```

    :param filters: Filter declared using the legacy style.
    :return: The equivalent filter in the new style. The root of the returned filter is always
        a logical condition, i.e. a dictionary with an `"operator"` and a `"conditions"` key.
    """
    converted = _internal_convert(filters)
    if "conditions" not in converted:
        # This is done to handle a corner case when filter is really simple like so:
        #   {"text": "A Foo Document 1"}
        # The root '$and' operator is implicit so the conversion doesn't handle
        # it and it must be added explicitly like so.
        # This only happens for simple filters like the one above.
        return {"operator": "AND", "conditions": [converted]}
    return converted


def _internal_convert(filters: Union[List[Any], Dict[str, Any]], previous_key=None) -> Any:
    """
    Recursively convert filters from legacy to new style.

    :param filters: The legacy filter, or the fragment of it that is currently being visited.
    :param previous_key: The key under which `filters` was found in its parent: a field name,
        a legacy operator, or `None` when `filters` is the root.
    :return: A single new-style condition, or a list of conditions when `filters` holds several
        of them and the caller is expected to merge them into its own group.
    """
    if isinstance(filters, list) and (result := _handle_list(filters, previous_key)) is not None:
        return result

    if not isinstance(filters, dict):
        return _handle_non_dict(filters, previous_key)

    conditions = []
    for key, value in filters.items():
        if key not in ALL_OPERATORS:
            # `key` is a field name, everything below it is compared against that field.
            converted = _internal_convert(value, previous_key=key)
            if isinstance(converted, list):
                conditions.extend(converted)
            else:
                conditions.append(converted)
        elif key in LOGIC_OPERATORS:
            if previous_key is not None and previous_key not in ALL_OPERATORS:
                # Logical operator nested under a field, e.g. {"age": {"$not": {"$gt": 18}}}.
                # Recursing with the operator as `previous_key` would lose the field name,
                # so every operand is wrapped back under the field and converted on its own.
                converted = [_internal_convert(operand) for operand in _field_operands(previous_key, value)]
            else:
                # Logical operator at the root or under another logical operator; a list
                # operand is converted element by element by `_handle_list`.
                converted = _internal_convert(value, previous_key=key)
                if not isinstance(converted, list):
                    converted = [converted]
            conditions.append({"operator": LOGIC_OPERATORS[key], "conditions": converted})
        else:
            # Comparison operator: `previous_key` is the field being compared.
            conditions.append({"field": previous_key, "operator": COMPARISON_OPERATORS[key], "value": value})

    if len(conditions) == 1:
        return conditions[0]

    if previous_key is None:
        # Several conditions at the root are joined by an implicit '$and'.
        return {"operator": "AND", "conditions": conditions}

    return conditions


def _field_operands(field, value) -> List[Dict[str, Any]]:
    """
    Split the operand of a logical operator nested under `field` into standalone legacy filters.

    Each returned filter holds exactly one condition on `field`, so converting it always
    yields a single new-style condition.

    :param field: Name of the field the logical operator was nested under.
    :param value: Operand of the logical operator: a list of operands, a dictionary of
        operators, or a bare value (implicit '$eq').
    :return: One legacy filter per operand, each keyed by `field`.
    """
    if isinstance(value, list):
        return [{field: item} for item in value]
    if isinstance(value, dict):
        return [{field: {operator: operand}} for operator, operand in value.items()]
    return [{field: value}]


def _handle_list(filters, previous_key):
    """
    Convert a list found under `previous_key`.

    :return: The list of converted elements when `previous_key` is a logical operator, an
        implicit `in` comparison when `previous_key` is not an operator, `None` when
        `previous_key` is a comparison operator so that the caller handles the list itself.
    """
    if previous_key in LOGIC_OPERATORS:
        return [_internal_convert(f) for f in filters]
    if previous_key not in COMPARISON_OPERATORS:
        return {"field": previous_key, "operator": "in", "value": filters}
    return None


def _handle_non_dict(filters, previous_key):
    """
    Convert a value that is not a dictionary.

    :return: An implicit `==` comparison when `previous_key` is not an operator, otherwise
        `filters` unchanged.
    """
    if previous_key not in ALL_OPERATORS:
        return {"field": previous_key, "operator": "==", "value": filters}
    return filters


# Operator mappings from legacy style to new one
LOGIC_OPERATORS = {"$and": "AND", "$or": "OR", "$not": "NOT"}

COMPARISON_OPERATORS = {
    "$eq": "==",
    "$ne": "!=",
    "$gt": ">",
    "$gte": ">=",
    "$lt": "<",
    "$lte": "<=",
    "$in": "in",
    "$nin": "not in",
}

ALL_OPERATORS = {**LOGIC_OPERATORS, **COMPARISON_OPERATORS}
# Conversion cases as (case id, legacy-style filter, expected new-style filter).
# The literals are kept fully spelled out so each expectation can be read on its own.
_CONVERSION_CASES = [
    (
        "All operators explicit",
        {
            "$and": {
                "type": {"$eq": "article"},
                "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                "rating": {"$gte": 3},
                "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
            }
        },
        {
            "operator": "AND",
            "conditions": [
                {"field": "type", "operator": "==", "value": "article"},
                {"field": "date", "operator": ">=", "value": "2015-01-01"},
                {"field": "date", "operator": "<", "value": "2021-01-01"},
                {"field": "rating", "operator": ">=", "value": 3},
                {
                    "operator": "OR",
                    "conditions": [
                        {"field": "genre", "operator": "in", "value": ["economy", "politics"]},
                        {"field": "publisher", "operator": "==", "value": "nytimes"},
                    ],
                },
            ],
        },
    ),
    (
        "Root $and implicit",
        {
            "type": "article",
            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
            "rating": {"$gte": 3},
            "$or": {"genre": ["economy", "politics"], "publisher": "nytimes"},
        },
        {
            "operator": "AND",
            "conditions": [
                {"field": "type", "operator": "==", "value": "article"},
                {"field": "date", "operator": ">=", "value": "2015-01-01"},
                {"field": "date", "operator": "<", "value": "2021-01-01"},
                {"field": "rating", "operator": ">=", "value": 3},
                {
                    "operator": "OR",
                    "conditions": [
                        {"field": "genre", "operator": "in", "value": ["economy", "politics"]},
                        {"field": "publisher", "operator": "==", "value": "nytimes"},
                    ],
                },
            ],
        },
    ),
    (
        "Root $or with list and multiple comparisons",
        {
            "$or": [
                {"Type": "News Paper", "Date": {"$lt": "2019-01-01"}},
                {"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}},
            ]
        },
        {
            "operator": "OR",
            "conditions": [
                {
                    "operator": "AND",
                    "conditions": [
                        {"field": "Type", "operator": "==", "value": "News Paper"},
                        {"field": "Date", "operator": "<", "value": "2019-01-01"},
                    ],
                },
                {
                    "operator": "AND",
                    "conditions": [
                        {"field": "Type", "operator": "==", "value": "Blog Post"},
                        {"field": "Date", "operator": ">=", "value": "2019-01-01"},
                    ],
                },
            ],
        },
    ),
    (
        "Implicit root $and and field $eq",
        {"text": "A Foo Document 1"},
        {"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]},
    ),
    (
        "Root $or with dict and field $or with list",
        {"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}},
        {
            "operator": "OR",
            "conditions": [
                {
                    "operator": "OR",
                    "conditions": [
                        {"field": "name", "operator": "==", "value": "name_0"},
                        {"field": "name", "operator": "==", "value": "name_1"},
                    ],
                },
                {"field": "number", "operator": "<", "value": 1.0},
            ],
        },
    ),
    (
        "Implicit $and and field $in",
        {"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]},
        {
            "operator": "AND",
            "conditions": [
                {"field": "number", "operator": "<=", "value": 2},
                {"field": "number", "operator": ">=", "value": 0},
                {"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
            ],
        },
    ),
    (
        "Implicit root $and and field $and with list",
        {"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}},
        {
            "operator": "AND",
            "conditions": [
                {"field": "number", "operator": "<=", "value": 2},
                {"field": "number", "operator": ">=", "value": 0},
            ],
        },
    ),
    (
        "Root explicit $not",
        {
            "$not": {
                "number": {"$lt": 1.0},
                "$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"chapter": {"$eq": "intro"}}},
            }
        },
        {
            "operator": "NOT",
            "conditions": [
                {"field": "number", "operator": "<", "value": 1.0},
                {
                    "operator": "AND",
                    "conditions": [
                        {"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
                        {"operator": "NOT", "conditions": [{"field": "chapter", "operator": "==", "value": "intro"}]},
                    ],
                },
            ],
        },
    ),
]

# One pytest parameter set per case, in declaration order, identified by the case id.
filters_data = [pytest.param(legacy, expected, id=case_id) for case_id, legacy, expected in _CONVERSION_CASES]


@pytest.mark.parametrize("old_style, new_style", filters_data)
def test_convert(old_style, new_style):
    """Check that each legacy-style filter converts to its expected new-style filter."""
    converted = convert(old_style)
    assert converted == new_style