chore: Remove all references to old filter syntax (#8342)

* Remove all references to old filter syntax

* More removals

* Lint

* Do not remove test_filter_retriever.py

* Add reno note

* Update ValueError text to match text in haystack-core-integrations
This commit is contained in:
Vladimir Blagojevic 2024-09-12 16:28:31 +02:00 committed by GitHub
parent 672bcf7e03
commit 7e9f153e78
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 20 additions and 199 deletions

View File

@ -5,7 +5,7 @@
from typing import Dict, List
from haystack import Document, component
from haystack.utils.filters import convert, document_matches_filter
from haystack.utils.filters import document_matches_filter
@component
@ -96,8 +96,10 @@ class MetadataRouter:
cur_document_matched = False
for edge, rule in self.rules.items():
if "operator" not in rule:
# Must be a legacy filter, convert it
rule = convert(rule)
raise ValueError(
"Invalid filter syntax. "
"See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
)
if document_matches_filter(rule, document):
output[edge].append(document)
cur_document_matched = True

View File

@ -18,7 +18,7 @@ from haystack.dataclasses import Document
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import expit
from haystack.utils.filters import convert, document_matches_filter
from haystack.utils.filters import document_matches_filter
logger = logging.getLogger(__name__)
@ -395,7 +395,10 @@ class InMemoryDocumentStore:
"""
if filters:
if "operator" not in filters and "conditions" not in filters:
filters = convert(filters)
raise ValueError(
"Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering "
"for details."
)
return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
return list(self.storage.values())
@ -502,7 +505,10 @@ class InMemoryDocumentStore:
}
if filters:
if "operator" not in filters:
filters = convert(filters)
raise ValueError(
"Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering "
"for details."
)
filters = {"operator": "AND", "conditions": [content_type_filter, filters]}
else:
filters = content_type_filter

View File

@ -2,10 +2,9 @@
#
# SPDX-License-Identifier: Apache-2.0
import warnings
from dataclasses import fields
from datetime import datetime
from typing import Any, Dict, List, Union
from typing import Any, Dict, List
import pandas as pd
@ -177,145 +176,3 @@ def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool
operator: str = condition["operator"]
filter_value: Any = condition["value"]
return COMPARISON_OPERATORS[operator](filter_value=filter_value, document_value=document_value)
def convert(filters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a filter declared using the legacy style into the new style.

    This is mostly meant to ease migration from Haystack 1.x to 2.x for developers
    of Document Stores and Components that use filters.

    This function doesn't verify if `filters` are declared using the legacy style.

    Example usage:
    ```python
    legacy_filter = {
        "$and": {
            "type": {"$eq": "article"},
            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
            "rating": {"$gte": 3},
            "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
        }
    }
    assert convert(legacy_filter) == {
        "operator": "AND",
        "conditions": [
            {"field": "type", "operator": "==", "value": "article"},
            {"field": "date", "operator": ">=", "value": "2015-01-01"},
            {"field": "date", "operator": "<", "value": "2021-01-01"},
            {"field": "rating", "operator": ">=", "value": 3},
            {
                "operator": "OR",
                "conditions": [
                    {"field": "genre", "operator": "in", "value": ["economy", "politics"]},
                    {"field": "publisher", "operator": "==", "value": "nytimes"},
                ],
            },
        ],
    }
    ```

    :param filters: Filter dictionary declared using the legacy style.
    :returns: The converted filter. A single bare condition is wrapped in an explicit
        top-level "AND" so the result always carries a "conditions" key.
    :raises ValueError: If `filters` is not a dictionary.
    """
    # stacklevel=2 attributes the warning to the code calling `convert`, which is
    # the code that actually needs to migrate, rather than to this line.
    warnings.warn(
        "The use of legacy (Haystack 1.x) filters is deprecated and will be removed in the future. "
        "Please use the new filter style as described in the documentation - "
        "https://docs.haystack.deepset.ai/docs/metadata-filtering",
        DeprecationWarning,
        stacklevel=2,
    )
    if not isinstance(filters, dict):
        msg = f"Can't convert filters from type '{type(filters)}'"
        raise ValueError(msg)

    converted = _internal_convert(filters)
    if "conditions" not in converted:
        # This is done to handle a corner case when filter is really simple like so:
        #   {"text": "A Foo Document 1"}
        # The root '$and' operator is implicit so the conversion doesn't handle
        # it and it must be added explicitly like so.
        # This only happens for simple filters like the one above.
        return {"operator": "AND", "conditions": [converted]}
    return converted
def _internal_convert(filters: Union[List[Any], Dict[str, Any]], previous_key=None) -> Any:
    """
    Recursively convert filters from legacy to new style.

    :param filters: Legacy filter fragment to convert: a dictionary, a list or a plain value.
    :param previous_key: Key under which `filters` was found in the enclosing dictionary
        (a field name or a legacy operator), or `None` for the top-level call.
    :returns: A single converted condition, a list of conditions, or the untouched value
        when a plain value sits directly under a legacy operator.
    :raises FilterError: If a non-operator key is nested directly under another non-operator key.
    """
    conditions = []

    if isinstance(filters, list) and (result := _handle_list(filters, previous_key)) is not None:
        return result

    if not isinstance(filters, dict):
        return _handle_non_dict(filters, previous_key)

    for key, value in filters.items():
        if (
            previous_key is not None
            and previous_key not in ALL_LEGACY_OPERATORS_MAPPING
            and key not in ALL_LEGACY_OPERATORS_MAPPING
        ):
            msg = f"This filter ({filters}) seems to be malformed."
            raise FilterError(msg)

        if key not in ALL_LEGACY_OPERATORS_MAPPING:
            # `key` is a field name: convert whatever is declared for that field.
            converted = _internal_convert(value, previous_key=key)
            if isinstance(converted, list):
                conditions.extend(converted)
            else:
                conditions.append(converted)
        elif key in LEGACY_LOGICAL_OPERATORS_MAPPING:
            if previous_key not in ALL_LEGACY_OPERATORS_MAPPING and isinstance(value, list):
                # Logical operator applied to a list of values for a single field:
                # every value becomes its own condition on that field.
                converted = [_internal_convert({previous_key: v}) for v in value]
                conditions.append({"operator": ALL_LEGACY_OPERATORS_MAPPING[key], "conditions": converted})
            else:
                converted = _internal_convert(value, previous_key=key)
                if key == "$not" and not isinstance(converted, (dict, list)):
                    # This handles a corner when '$not' is used like this:
                    # '{"page": {"$not": 102}}'
                    # Without this check we would miss the implicit '$eq'
                    converted = {"field": previous_key, "operator": "==", "value": value}
                if not isinstance(converted, list):
                    converted = [converted]
                conditions.append({"operator": ALL_LEGACY_OPERATORS_MAPPING[key], "conditions": converted})
        elif key in LEGACY_COMPARISON_OPERATORS_MAPPING:
            conditions.append({"field": previous_key, "operator": ALL_LEGACY_OPERATORS_MAPPING[key], "value": value})

    if len(conditions) == 1:
        return conditions[0]

    if previous_key is None:
        # Several conditions at the top level are implicitly AND-ed together.
        return {"operator": "AND", "conditions": conditions}

    return conditions
def _handle_list(filters, previous_key):
    """
    Convert a legacy filter fragment that is a list.

    Returns a list of converted conditions when the list sits under a logical operator,
    an "in" condition when it sits under anything that is not a comparison operator,
    and `None` when it sits under a comparison operator.
    """
    if previous_key in LEGACY_LOGICAL_OPERATORS_MAPPING:
        # Each operand of a logical operator is converted on its own.
        return list(map(_internal_convert, filters))
    if previous_key in LEGACY_COMPARISON_OPERATORS_MAPPING:
        # Nothing to build here; the caller carries on when it gets `None`.
        return None
    return {"field": previous_key, "operator": "in", "value": filters}
def _handle_non_dict(filters, previous_key):
    """
    Convert a legacy filter fragment that is not a dictionary.

    A value found under a legacy operator is returned untouched; a value found under
    any other key becomes an "==" condition on that key.
    """
    under_operator = previous_key in ALL_LEGACY_OPERATORS_MAPPING
    if under_operator:
        return filters
    return {"field": previous_key, "operator": "==", "value": filters}
# Legacy-style operators and the new-style operators they translate to.
LEGACY_LOGICAL_OPERATORS_MAPPING = {
    "$and": "AND",
    "$or": "OR",
    "$not": "NOT",
}
LEGACY_COMPARISON_OPERATORS_MAPPING = dict(
    [
        ("$eq", "=="),
        ("$ne", "!="),
        ("$gt", ">"),
        ("$gte", ">="),
        ("$lt", "<"),
        ("$lte", "<="),
        ("$in", "in"),
        ("$nin", "not in"),
    ]
)
# Combined lookup table: logical operators first, comparison operators after.
ALL_LEGACY_OPERATORS_MAPPING = dict(LEGACY_LOGICAL_OPERATORS_MAPPING)
ALL_LEGACY_OPERATORS_MAPPING.update(LEGACY_COMPARISON_OPERATORS_MAPPING)

View File

@ -0,0 +1,4 @@
---
upgrade:
- |
Support for the legacy filter syntax has been completely removed; passing a legacy-style filter now raises a `ValueError`. Use the new filter syntax instead. See the [docs](https://docs.haystack.deepset.ai/docs/metadata-filtering) for more details.

View File

@ -274,36 +274,6 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904
results = document_store.bm25_retrieval(query="doesn't matter, top_k is 10", top_k=10)
assert len(results) == 0
def test_bm25_retrieval_with_filters(self, document_store: InMemoryDocumentStore):
    """BM25 retrieval with a `{"selected": True}` filter returns only the flagged document."""
    flagged = Document(content="Java is, well...", meta={"selected": True})
    document_store.write_documents([Document(), flagged, Document(content="Bird watching")])

    hits = document_store.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})

    assert len(hits) == 1
    assert hits[0].id == flagged.id
def test_bm25_retrieval_with_filters_keeps_default_filters(self, document_store: InMemoryDocumentStore):
    """The only document carrying `selected: True` has no content, so nothing is expected back."""
    document_store.write_documents(
        [
            Document(meta={"selected": True}),
            Document(content="Gardening"),
            Document(content="Bird watching"),
        ]
    )

    hits = document_store.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})

    assert len(hits) == 0
def test_bm25_retrieval_with_filters_on_text_or_dataframe(self, document_store: InMemoryDocumentStore):
    """Filtering on `content` being `None` returns only the dataframe-backed document."""
    table = pd.DataFrame({"language": ["Python", "Java"], "use": ["Data Science", "Web"]})
    table_document = Document(dataframe=table)
    document_store.write_documents(
        [Document(), Document(content="Gardening"), Document(content="Bird watching"), table_document]
    )

    hits = document_store.bm25_retrieval(query="Java", top_k=10, filters={"content": None})

    assert len(hits) == 1
    assert hits[0].id == table_document.id
def test_bm25_retrieval_with_documents_with_mixed_content(self, document_store: InMemoryDocumentStore):
    """With an `embedding` `$not` `None` filter, only the document holding both text and an embedding is returned."""
    vector = [1.0, 2.0, 3.0]
    text_and_embedding = Document(content="Gardening is a hobby", embedding=vector)
    document_store.write_documents(
        [Document(embedding=[1.0, 2.0, 3.0]), text_and_embedding, Document(content="Bird watching")]
    )

    hits = document_store.bm25_retrieval(query="Gardening", top_k=10, filters={"embedding": {"$not": None}})

    assert len(hits) == 1
    assert hits[0].id == text_and_embedding.id
def test_embedding_retrieval(self):
docstore = InMemoryDocumentStore(embedding_similarity_function="cosine")
# Tests if the embedding retrieval method returns the correct document based on the input query embedding.

View File

@ -6,7 +6,7 @@ import pandas as pd
from haystack import Document
from haystack.errors import FilterError
from haystack.utils.filters import convert, document_matches_filter
from haystack.utils.filters import document_matches_filter
document_matches_filter_data = [
# == operator params
@ -708,21 +708,3 @@ filters_data = [
id="Explicit $not with implicit $eq",
),
]
@pytest.mark.parametrize("old_style, new_style", filters_data)
def test_convert(old_style, new_style):
    """Each legacy filter in `filters_data` converts to its paired new-style filter."""
    converted = convert(old_style)
    assert converted == new_style
def test_convert_with_incorrect_input_type():
    """`convert` raises `ValueError` when given a string instead of a dictionary."""
    not_a_dict = "some string"
    with pytest.raises(ValueError):
        convert(not_a_dict)
def test_convert_with_incorrect_filter_nesting():
    """`convert` raises `FilterError` when a field name is nested directly under another field name."""
    malformed_filters = (
        {"number": {"page": "100"}},
        {"number": {"page": {"chapter": "intro"}}},
    )
    for malformed in malformed_filters:
        with pytest.raises(FilterError):
            convert(malformed)