feat: Implement function to convert legacy filters to new style (#6314)

* Implement function to convert legacy filters to new style

* Reduce return statements in conversion to fix linting

* Move convert function in different module

* Fix typos in docstrings

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

---------

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
This commit is contained in:
Silvano Cerza 2023-11-20 13:00:05 +01:00 committed by GitHub
parent 497299c27a
commit 83c245db74
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 317 additions and 1 deletions

View File

@ -292,3 +292,121 @@ def _list_conditions(conditions: Any) -> List[Any]:
if isinstance(conditions, dict):
return [{key: value} for key, value in conditions.items()]
return [conditions]
def convert(filters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Translate a filter written in the legacy style into the new style.

    The main purpose is to ease the migration from Haystack 1.x to 2.x for developers
    of Document Stores and Components that use filters.

    No check is made that `filters` is actually declared using the legacy style.

    Example usage:
    ```python
    legacy_filter = {
        "$and": {
            "type": {"$eq": "article"},
            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
            "rating": {"$gte": 3},
            "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
        }
    }
    assert convert(legacy_filter) == {
        "operator": "AND",
        "conditions": [
            {"field": "type", "operator": "==", "value": "article"},
            {"field": "date", "operator": ">=", "value": "2015-01-01"},
            {"field": "date", "operator": "<", "value": "2021-01-01"},
            {"field": "rating", "operator": ">=", "value": 3},
            {
                "operator": "OR",
                "conditions": [
                    {"field": "genre", "operator": "in", "value": ["economy", "politics"]},
                    {"field": "publisher", "operator": "==", "value": "nytimes"},
                ],
            },
        ],
    }
    ```

    :param filters: The filter to convert.
    :returns: A dictionary that always carries a top-level "conditions" key.
    """
    new_style = _internal_convert(filters)
    if "conditions" in new_style:
        return new_style

    # A very simple filter such as {"text": "A Foo Document 1"} converts to a
    # lone comparison: the root '$and' is implicit in the legacy style, so the
    # recursive conversion never emits it. Wrap the comparison explicitly so
    # the result always has a logical operator at its root.
    return {"operator": "AND", "conditions": [new_style]}
def _internal_convert(filters: Union[List[Any], Dict[str, Any]], previous_key=None) -> Any:
    """
    Recursively convert filters from legacy to new style.

    :param filters: The legacy filter, or the fragment of it reached while recursing.
        Lists and non-dict values are delegated to `_handle_list` and `_handle_non_dict`.
    :param previous_key: The key under which `filters` was found in its parent dictionary:
        a field name, a legacy operator, or `None` at the root.
    :returns: For a dictionary: the single converted condition when there is exactly one;
        otherwise an `{"operator": "AND", "conditions": [...]}` dictionary when
        `previous_key` is `None`, or the plain list of conditions for the caller to merge.
    """
    if isinstance(filters, list) and (result := _handle_list(filters, previous_key)) is not None:
        return result

    if not isinstance(filters, dict):
        return _handle_non_dict(filters, previous_key)

    # A previous key that is neither the root marker nor an operator is the name
    # of the field that the operators found in `filters` apply to.
    inside_field = previous_key is not None and previous_key not in ALL_OPERATORS

    conditions = []
    for key, value in filters.items():
        if key not in ALL_OPERATORS:
            # `key` is a field name, whatever is nested below applies to it.
            converted = _internal_convert(value, previous_key=key)
            if isinstance(converted, list):
                conditions.extend(converted)
            else:
                conditions.append(converted)
        elif key in LOGIC_OPERATORS:
            if previous_key not in ALL_OPERATORS and isinstance(value, list):
                # Each list item is converted on its own, re-attached to the
                # current field (or to the root when `previous_key` is None).
                converted = [_internal_convert({previous_key: v}) for v in value]
            else:
                # When the logic operator is nested inside a field, e.g.
                # {"age": {"$not": {"$gt": 18}}}, keep passing the field name down.
                # Passing the operator instead would make the nested comparison
                # use "$not" as its field and would leave a bare value such as
                # {"page": {"$not": 102}} unconverted.
                next_key = previous_key if inside_field else key
                converted = _internal_convert(value, previous_key=next_key)
                if not isinstance(converted, list):
                    converted = [converted]
            conditions.append({"operator": ALL_OPERATORS[key], "conditions": converted})
        elif key in COMPARISON_OPERATORS:
            conditions.append({"field": previous_key, "operator": ALL_OPERATORS[key], "value": value})

    if len(conditions) == 1:
        return conditions[0]

    if previous_key is None:
        # Sibling conditions at the root are joined by the implicit '$and'.
        return {"operator": "AND", "conditions": conditions}

    return conditions
def _handle_list(filters, previous_key):
    """
    Convert a list found under `previous_key` while walking a legacy filter.

    :param filters: The list to convert.
    :param previous_key: The key under which the list was found.
    :returns: The list of individually converted items when `previous_key` is a logic
        operator, `None` when it is a comparison operator, and an "in" comparison on
        the field named `previous_key` otherwise.
    """
    if previous_key in LOGIC_OPERATORS:
        return list(map(_internal_convert, filters))

    if previous_key in COMPARISON_OPERATORS:
        # Nothing to do here, the caller deals with this case.
        return None

    # A list directly under a field name is an implicit '$in'.
    return {"field": previous_key, "operator": "in", "value": filters}
def _handle_non_dict(filters, previous_key):
    """
    Convert a value that is not a dictionary while walking a legacy filter.

    :param filters: The value to convert.
    :param previous_key: The key under which the value was found.
    :returns: `filters` unchanged when `previous_key` is a legacy operator, otherwise
        an "==" comparison on the field named `previous_key`.
    """
    if previous_key in ALL_OPERATORS:
        return filters

    # A bare value directly under a field name is an implicit '$eq'.
    return {"field": previous_key, "operator": "==", "value": filters}
# Translation tables from legacy "$"-prefixed operators to their new-style names.

# Operators that group other conditions together.
LOGIC_OPERATORS = {
    "$and": "AND",
    "$or": "OR",
    "$not": "NOT",
}

# Operators that compare a field against a value.
COMPARISON_OPERATORS = {
    # Equality
    "$eq": "==",
    "$ne": "!=",
    # Ordering
    "$gt": ">",
    "$gte": ">=",
    "$lt": "<",
    "$lte": "<=",
    # Membership
    "$in": "in",
    "$nin": "not in",
}

# Every legacy operator in a single table, logic operators first.
ALL_OPERATORS = dict(LOGIC_OPERATORS)
ALL_OPERATORS.update(COMPARISON_OPERATORS)

View File

@ -0,0 +1,42 @@
---
prelude: >
Following the proposal to introduce a new way of declaring filters
in Haystack 2.x for Document Stores and all Components that use them,
we introduce a utility function to convert the legacy style to the new style.
This will make life easier for developers when implementing new Document Stores,
as they will only need to implement filtering logic for the new style filters;
conversion of legacy filters is handled entirely by the utility function.
An example usage would be something similar to this:
```python
legacy_filter = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
}
}
assert convert(legacy_filter) == {
"operator": "AND",
"conditions": [
{"field": "type", "operator": "==", "value": "article"},
{"field": "date", "operator": ">=", "value": "2015-01-01"},
{"field": "date", "operator": "<", "value": "2021-01-01"},
{"field": "rating", "operator": ">=", "value": 3},
{
"operator": "OR",
"conditions": [
{"field": "genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "publisher", "operator": "==", "value": "nytimes"},
],
},
],
}
```
For more information on the new filters technical specification see [proposal #6001](https://github.com/deepset-ai/haystack/blob/main/proposals/text/6001-document-store-filter-rework.md)
preview:
- |
Introduce a function to convert legacy filters to the new style

View File

@ -4,7 +4,7 @@ import numpy as np
from haystack.preview import Document
from haystack.preview.errors import FilterError
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.utils.filters import convert, document_matches_filter
class TestFilterUtils: # pylint: disable=R0904
@ -503,3 +503,159 @@ class TestFilterUtils: # pylint: disable=R0904
document = Document(meta={"age": 17})
filter = {"age": {"$not": {"$gt": 18}}}
assert document_matches_filter(filter, document)
# Parametrized cases for `test_convert`: each `pytest.param` pairs a legacy-style
# filter (first argument) with the new-style filter it is expected to convert to
# (second argument); `id` names the scenario in the test report.
filters_data = [
pytest.param(
{
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
}
},
{
"operator": "AND",
"conditions": [
{"field": "type", "operator": "==", "value": "article"},
{"field": "date", "operator": ">=", "value": "2015-01-01"},
{"field": "date", "operator": "<", "value": "2021-01-01"},
{"field": "rating", "operator": ">=", "value": 3},
{
"operator": "OR",
"conditions": [
{"field": "genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "publisher", "operator": "==", "value": "nytimes"},
],
},
],
},
id="All operators explicit",
),
pytest.param(
{
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {"genre": ["economy", "politics"], "publisher": "nytimes"},
},
{
"operator": "AND",
"conditions": [
{"field": "type", "operator": "==", "value": "article"},
{"field": "date", "operator": ">=", "value": "2015-01-01"},
{"field": "date", "operator": "<", "value": "2021-01-01"},
{"field": "rating", "operator": ">=", "value": 3},
{
"operator": "OR",
"conditions": [
{"field": "genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "publisher", "operator": "==", "value": "nytimes"},
],
},
],
},
id="Root $and implicit",
),
pytest.param(
{
"$or": [
{"Type": "News Paper", "Date": {"$lt": "2019-01-01"}},
{"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}},
]
},
{
"operator": "OR",
"conditions": [
{
"operator": "AND",
"conditions": [
{"field": "Type", "operator": "==", "value": "News Paper"},
{"field": "Date", "operator": "<", "value": "2019-01-01"},
],
},
{
"operator": "AND",
"conditions": [
{"field": "Type", "operator": "==", "value": "Blog Post"},
{"field": "Date", "operator": ">=", "value": "2019-01-01"},
],
},
],
},
id="Root $or with list and multiple comparisons",
),
pytest.param(
{"text": "A Foo Document 1"},
{"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]},
id="Implicit root $and and field $eq",
),
pytest.param(
{"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}},
{
"operator": "OR",
"conditions": [
{
"operator": "OR",
"conditions": [
{"field": "name", "operator": "==", "value": "name_0"},
{"field": "name", "operator": "==", "value": "name_1"},
],
},
{"field": "number", "operator": "<", "value": 1.0},
],
},
id="Root $or with dict and field $or with list",
),
pytest.param(
{"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]},
{
"operator": "AND",
"conditions": [
{"field": "number", "operator": "<=", "value": 2},
{"field": "number", "operator": ">=", "value": 0},
{"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
],
},
id="Implicit $and and field $in",
),
pytest.param(
{"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}},
{
"operator": "AND",
"conditions": [
{"field": "number", "operator": "<=", "value": 2},
{"field": "number", "operator": ">=", "value": 0},
],
},
id="Implicit root $and and field $and with list",
),
pytest.param(
{
"$not": {
"number": {"$lt": 1.0},
"$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"chapter": {"$eq": "intro"}}},
}
},
{
"operator": "NOT",
"conditions": [
{"field": "number", "operator": "<", "value": 1.0},
{
"operator": "AND",
"conditions": [
{"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
{"operator": "NOT", "conditions": [{"field": "chapter", "operator": "==", "value": "intro"}]},
],
},
],
},
id="Root explicit $not",
),
]
@pytest.mark.parametrize("old_style, new_style", filters_data)
def test_convert(old_style, new_style):
    """Every legacy filter in `filters_data` must convert to its paired new-style filter."""
    converted = convert(old_style)
    assert converted == new_style