diff --git a/haystack/components/routers/metadata_router.py b/haystack/components/routers/metadata_router.py index 296a0755b..7c6c007d1 100644 --- a/haystack/components/routers/metadata_router.py +++ b/haystack/components/routers/metadata_router.py @@ -5,7 +5,7 @@ from typing import Dict, List from haystack import Document, component -from haystack.utils.filters import convert, document_matches_filter +from haystack.utils.filters import document_matches_filter @component @@ -96,8 +96,10 @@ class MetadataRouter: cur_document_matched = False for edge, rule in self.rules.items(): if "operator" not in rule: - # Must be a legacy filter, convert it - rule = convert(rule) + raise ValueError( + "Invalid filter syntax. " + "See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." + ) if document_matches_filter(rule, document): output[edge].append(document) cur_document_matched = True diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py index 4fd10e1cd..245c55f78 100644 --- a/haystack/document_stores/in_memory/document_store.py +++ b/haystack/document_stores/in_memory/document_store.py @@ -18,7 +18,7 @@ from haystack.dataclasses import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.utils import expit -from haystack.utils.filters import convert, document_matches_filter +from haystack.utils.filters import document_matches_filter logger = logging.getLogger(__name__) @@ -395,7 +395,10 @@ class InMemoryDocumentStore: """ if filters: if "operator" not in filters and "conditions" not in filters: - filters = convert(filters) + raise ValueError( + "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering " + "for details." + ) return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)] return list(self.storage.values()) @@ -502,7 +505,10 @@ class InMemoryDocumentStore: } if filters: if "operator" not in filters: - filters = convert(filters) + raise ValueError( + "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering " + "for details." + ) filters = {"operator": "AND", "conditions": [content_type_filter, filters]} else: filters = content_type_filter diff --git a/haystack/utils/filters.py b/haystack/utils/filters.py index ac25ff9bc..8cdf294ff 100644 --- a/haystack/utils/filters.py +++ b/haystack/utils/filters.py @@ -2,10 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -import warnings from dataclasses import fields from datetime import datetime -from typing import Any, Dict, List, Union +from typing import Any, Dict, List import pandas as pd @@ -177,145 +176,3 @@ def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool operator: str = condition["operator"] filter_value: Any = condition["value"] return COMPARISON_OPERATORS[operator](filter_value=filter_value, document_value=document_value) - - -def convert(filters: Dict[str, Any]) -> Dict[str, Any]: - """ - Convert a filter declared using the legacy style into the new style. - - This is mostly meant to ease migration from Haystack 1.x to 2.x for developers - of Document Stores and Components that use filters. - - This function doesn't verify if `filters` are declared using the legacy style. - - Example usage: - ```python - legacy_filter = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}}, - } - } - assert convert(legacy_filter) == { - "operator": "AND", - "conditions": [ - {"field": "type", "operator": "==", "value": "article"}, - {"field": "date", "operator": ">=", "value": "2015-01-01"}, - {"field": "date", "operator": "<", "value": "2021-01-01"}, - {"field": "rating", "operator": ">=", "value": 3}, - { - "operator": "OR", - "conditions": [ - {"field": "genre", "operator": "in", "value": ["economy", "politics"]}, - {"field": "publisher", "operator": "==", "value": "nytimes"}, - ], - }, - ], - } - ``` - """ - warnings.warn( - "The use of legacy (Haystack 1.x) filters is deprecated and will be removed in the future. " - "Please use the new filter style as described in the documentation - " - "https://docs.haystack.deepset.ai/docs/metadata-filtering", - DeprecationWarning, - ) - - if not isinstance(filters, dict): - msg = f"Can't convert filters from type '{type(filters)}'" - raise ValueError(msg) - - converted = _internal_convert(filters) - if "conditions" not in converted: - # This is done to handle a corner case when filter is really simple like so: - # {"text": "A Foo Document 1"} - # The root '$and' operator is implicit so the conversion doesn't handle - # it and it must be added explicitly like so. - # This only happens for simple filters like the one above. - return {"operator": "AND", "conditions": [converted]} - return converted - - -def _internal_convert(filters: Union[List[Any], Dict[str, Any]], previous_key=None) -> Any: - """ - Recursively convert filters from legacy to new style. - """ - conditions = [] - - if isinstance(filters, list) and (result := _handle_list(filters, previous_key)) is not None: - return result - - if not isinstance(filters, dict): - return _handle_non_dict(filters, previous_key) - - for key, value in filters.items(): - if ( - previous_key is not None - and previous_key not in ALL_LEGACY_OPERATORS_MAPPING - and key not in ALL_LEGACY_OPERATORS_MAPPING - ): - msg = f"This filter ({filters}) seems to be malformed." - raise FilterError(msg) - if key not in ALL_LEGACY_OPERATORS_MAPPING: - converted = _internal_convert(value, previous_key=key) - if isinstance(converted, list): - conditions.extend(converted) - else: - conditions.append(converted) - elif key in LEGACY_LOGICAL_OPERATORS_MAPPING: - if previous_key not in ALL_LEGACY_OPERATORS_MAPPING and isinstance(value, list): - converted = [_internal_convert({previous_key: v}) for v in value] - conditions.append({"operator": ALL_LEGACY_OPERATORS_MAPPING[key], "conditions": converted}) - else: - converted = _internal_convert(value, previous_key=key) - if key == "$not" and type(converted) not in [dict, list]: - # This handles a corner when '$not' is used like this: - # '{"page": {"$not": 102}}' - # Without this check we would miss the implicit '$eq' - converted = {"field": previous_key, "operator": "==", "value": value} - if not isinstance(converted, list): - converted = [converted] - conditions.append({"operator": ALL_LEGACY_OPERATORS_MAPPING[key], "conditions": converted}) - elif key in LEGACY_COMPARISON_OPERATORS_MAPPING: - conditions.append({"field": previous_key, "operator": ALL_LEGACY_OPERATORS_MAPPING[key], "value": value}) - - if len(conditions) == 1: - return conditions[0] - - if previous_key is None: - return {"operator": "AND", "conditions": conditions} - - return conditions - - -def _handle_list(filters, previous_key): - if previous_key in LEGACY_LOGICAL_OPERATORS_MAPPING: - return [_internal_convert(f) for f in filters] - elif previous_key not in LEGACY_COMPARISON_OPERATORS_MAPPING: - return {"field": previous_key, "operator": "in", "value": filters} - return None - - -def _handle_non_dict(filters, previous_key): - if previous_key not in ALL_LEGACY_OPERATORS_MAPPING: - return {"field": previous_key, "operator": "==", "value": filters} - return filters - - -# Operator mappings from legacy style to new one -LEGACY_LOGICAL_OPERATORS_MAPPING = {"$and": "AND", "$or": "OR", "$not": "NOT"} - -LEGACY_COMPARISON_OPERATORS_MAPPING = { - "$eq": "==", - "$ne": "!=", - "$gt": ">", - "$gte": ">=", - "$lt": "<", - "$lte": "<=", - "$in": "in", - "$nin": "not in", -} - -ALL_LEGACY_OPERATORS_MAPPING = {**LEGACY_LOGICAL_OPERATORS_MAPPING, **LEGACY_COMPARISON_OPERATORS_MAPPING} diff --git a/releasenotes/notes/remove-legacy-filters-ff89cd0a00a64ce9.yaml b/releasenotes/notes/remove-legacy-filters-ff89cd0a00a64ce9.yaml new file mode 100644 index 000000000..da33cd069 --- /dev/null +++ b/releasenotes/notes/remove-legacy-filters-ff89cd0a00a64ce9.yaml @@ -0,0 +1,4 @@ +--- +upgrade: + - | + The legacy filter syntax support has been completely removed. Users need to use the new filter syntax. See the [docs](https://docs.haystack.deepset.ai/docs/metadata-filtering) for more details. diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py index 8b8ed0e5f..ba623eedb 100644 --- a/test/document_stores/test_in_memory.py +++ b/test/document_stores/test_in_memory.py @@ -274,36 +274,6 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904 results = document_store.bm25_retrieval(query="doesn't matter, top_k is 10", top_k=10) assert len(results) == 0 - def test_bm25_retrieval_with_filters(self, document_store: InMemoryDocumentStore): - selected_document = Document(content="Java is, well...", meta={"selected": True}) - docs = [Document(), selected_document, Document(content="Bird watching")] - document_store.write_documents(docs) - results = document_store.bm25_retrieval(query="Java", top_k=10, filters={"selected": True}) - assert len(results) == 1 - assert results[0].id == selected_document.id - - def test_bm25_retrieval_with_filters_keeps_default_filters(self, document_store: InMemoryDocumentStore): - docs = [Document(meta={"selected": True}), Document(content="Gardening"), Document(content="Bird watching")] - document_store.write_documents(docs) - results = document_store.bm25_retrieval(query="Java", top_k=10, filters={"selected": True}) - assert len(results) == 0 - - def test_bm25_retrieval_with_filters_on_text_or_dataframe(self, document_store: InMemoryDocumentStore): - document = Document(dataframe=pd.DataFrame({"language": ["Python", "Java"], "use": ["Data Science", "Web"]})) - docs = [Document(), Document(content="Gardening"), Document(content="Bird watching"), document] - document_store.write_documents(docs) - results = document_store.bm25_retrieval(query="Java", top_k=10, filters={"content": None}) - assert len(results) == 1 - assert results[0].id == document.id - - def test_bm25_retrieval_with_documents_with_mixed_content(self, document_store: InMemoryDocumentStore): - double_document = Document(content="Gardening is a hobby", embedding=[1.0, 2.0, 3.0]) - docs = [Document(embedding=[1.0, 2.0, 3.0]), double_document, Document(content="Bird watching")] - document_store.write_documents(docs) - results = document_store.bm25_retrieval(query="Gardening", top_k=10, filters={"embedding": {"$not": None}}) - assert len(results) == 1 - assert results[0].id == double_document.id - def test_embedding_retrieval(self): docstore = InMemoryDocumentStore(embedding_similarity_function="cosine") # Tests if the embedding retrieval method returns the correct document based on the input query embedding. diff --git a/test/utils/test_filters.py b/test/utils/test_filters.py index eccf1ca1e..708e50cb7 100644 --- a/test/utils/test_filters.py +++ b/test/utils/test_filters.py @@ -6,7 +6,7 @@ import pandas as pd from haystack import Document from haystack.errors import FilterError -from haystack.utils.filters import convert, document_matches_filter +from haystack.utils.filters import document_matches_filter document_matches_filter_data = [ # == operator params @@ -708,21 +708,3 @@ filters_data = [ id="Explicit $not with implicit $eq", ), ] - - -@pytest.mark.parametrize("old_style, new_style", filters_data) -def test_convert(old_style, new_style): - assert convert(old_style) == new_style - - -def test_convert_with_incorrect_input_type(): - with pytest.raises(ValueError): - convert("some string") - - -def test_convert_with_incorrect_filter_nesting(): - with pytest.raises(FilterError): - convert({"number": {"page": "100"}}) - - with pytest.raises(FilterError): - convert({"number": {"page": {"chapter": "intro"}}})