mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 20:46:31 +00:00
chore: Remove all references to old filter syntax (#8342)
* Remove all references to old filter syntax * More removals * Lint * Do not remove test_filter_retriever.py * Add reno note * Update ValueError text to match text in haystack-core-integrations
This commit is contained in:
parent
672bcf7e03
commit
7e9f153e78
@ -5,7 +5,7 @@
|
||||
from typing import Dict, List
|
||||
|
||||
from haystack import Document, component
|
||||
from haystack.utils.filters import convert, document_matches_filter
|
||||
from haystack.utils.filters import document_matches_filter
|
||||
|
||||
|
||||
@component
|
||||
@ -96,8 +96,10 @@ class MetadataRouter:
|
||||
cur_document_matched = False
|
||||
for edge, rule in self.rules.items():
|
||||
if "operator" not in rule:
|
||||
# Must be a legacy filter, convert it
|
||||
rule = convert(rule)
|
||||
raise ValueError(
|
||||
"Invalid filter syntax. "
|
||||
"See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
||||
)
|
||||
if document_matches_filter(rule, document):
|
||||
output[edge].append(document)
|
||||
cur_document_matched = True
|
||||
|
||||
@ -18,7 +18,7 @@ from haystack.dataclasses import Document
|
||||
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
|
||||
from haystack.document_stores.types import DuplicatePolicy
|
||||
from haystack.utils import expit
|
||||
from haystack.utils.filters import convert, document_matches_filter
|
||||
from haystack.utils.filters import document_matches_filter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -395,7 +395,10 @@ class InMemoryDocumentStore:
|
||||
"""
|
||||
if filters:
|
||||
if "operator" not in filters and "conditions" not in filters:
|
||||
filters = convert(filters)
|
||||
raise ValueError(
|
||||
"Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering "
|
||||
"for details."
|
||||
)
|
||||
return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
|
||||
return list(self.storage.values())
|
||||
|
||||
@ -502,7 +505,10 @@ class InMemoryDocumentStore:
|
||||
}
|
||||
if filters:
|
||||
if "operator" not in filters:
|
||||
filters = convert(filters)
|
||||
raise ValueError(
|
||||
"Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering "
|
||||
"for details."
|
||||
)
|
||||
filters = {"operator": "AND", "conditions": [content_type_filter, filters]}
|
||||
else:
|
||||
filters = content_type_filter
|
||||
|
||||
@ -2,10 +2,9 @@
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import warnings
|
||||
from dataclasses import fields
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Union
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
@ -177,145 +176,3 @@ def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool
|
||||
operator: str = condition["operator"]
|
||||
filter_value: Any = condition["value"]
|
||||
return COMPARISON_OPERATORS[operator](filter_value=filter_value, document_value=document_value)
|
||||
|
||||
|
||||
def convert(filters: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert a filter declared using the legacy style into the new style.

    This is mostly meant to ease migration from Haystack 1.x to 2.x for developers
    of Document Stores and Components that use filters.

    This function doesn't verify if `filters` are declared using the legacy style.

    Example usage:
    ```python
    legacy_filter = {
        "$and": {
            "type": {"$eq": "article"},
            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
            "rating": {"$gte": 3},
            "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
        }
    }
    assert convert(legacy_filter) == {
        "operator": "AND",
        "conditions": [
            {"field": "type", "operator": "==", "value": "article"},
            {"field": "date", "operator": ">=", "value": "2015-01-01"},
            {"field": "date", "operator": "<", "value": "2021-01-01"},
            {"field": "rating", "operator": ">=", "value": 3},
            {
                "operator": "OR",
                "conditions": [
                    {"field": "genre", "operator": "in", "value": ["economy", "politics"]},
                    {"field": "publisher", "operator": "==", "value": "nytimes"},
                ],
            },
        ],
    }
    ```

    :param filters:
        The legacy-style filter dictionary to convert.
    :returns:
        The equivalent new-style filter; the root is always a dictionary with a "conditions" key.
    :raises ValueError:
        If `filters` is not a dictionary.
    """
    warnings.warn(
        "The use of legacy (Haystack 1.x) filters is deprecated and will be removed in the future. "
        "Please use the new filter style as described in the documentation - "
        "https://docs.haystack.deepset.ai/docs/metadata-filtering",
        DeprecationWarning,
        # Attribute the warning to the code that called `convert()`, i.e. the place
        # that still passes legacy filters, instead of to this function itself.
        stacklevel=2,
    )

    if not isinstance(filters, dict):
        msg = f"Can't convert filters from type '{type(filters)}'"
        raise ValueError(msg)

    converted = _internal_convert(filters)
    if "conditions" not in converted:
        # This is done to handle a corner case when filter is really simple like so:
        # {"text": "A Foo Document 1"}
        # The root '$and' operator is implicit so the conversion doesn't handle
        # it and it must be added explicitly like so.
        # This only happens for simple filters like the one above.
        return {"operator": "AND", "conditions": [converted]}
    return converted
|
||||
|
||||
|
||||
def _internal_convert(filters: Union[List[Any], Dict[str, Any]], previous_key=None) -> Any:
    """
    Recursively convert filters from legacy to new style.

    :param filters:
        The legacy filter fragment being converted. Lists and non-dict values are
        delegated to `_handle_list` and `_handle_non_dict`.
    :param previous_key:
        The key under which `filters` was found in the parent dictionary: either a
        field name or a legacy operator. `None` for the root call.
    :returns:
        A single condition dictionary when exactly one condition was produced, an
        "AND" group when called for the root with zero or several conditions, and a
        plain list of conditions otherwise. Non-dict inputs return whatever the
        helper functions return.
    :raises FilterError:
        If a field name is nested directly inside another field name.
    """
    conditions = []

    if isinstance(filters, list) and (result := _handle_list(filters, previous_key)) is not None:
        return result

    if not isinstance(filters, dict):
        return _handle_non_dict(filters, previous_key)

    for key, value in filters.items():
        if (
            previous_key is not None
            and previous_key not in ALL_LEGACY_OPERATORS_MAPPING
            and key not in ALL_LEGACY_OPERATORS_MAPPING
        ):
            # Both the parent key and this key are field names: a field cannot contain a field.
            msg = f"This filter ({filters}) seems to be malformed."
            raise FilterError(msg)
        if key not in ALL_LEGACY_OPERATORS_MAPPING:
            # `key` is a field name: convert whatever is declared for that field.
            converted = _internal_convert(value, previous_key=key)
            if isinstance(converted, list):
                conditions.extend(converted)
            else:
                conditions.append(converted)
        elif key in LEGACY_LOGICAL_OPERATORS_MAPPING:
            if previous_key not in ALL_LEGACY_OPERATORS_MAPPING and isinstance(value, list):
                # Logical operator applied to a list of values under a field name:
                # re-wrap every value with the field name and convert it on its own.
                converted = [_internal_convert({previous_key: v}) for v in value]
                conditions.append({"operator": ALL_LEGACY_OPERATORS_MAPPING[key], "conditions": converted})
            else:
                converted = _internal_convert(value, previous_key=key)
                if key == "$not" and not isinstance(converted, (dict, list)):
                    # This handles a corner when '$not' is used like this:
                    # '{"page": {"$not": 102}}'
                    # Without this check we would miss the implicit '$eq'
                    converted = {"field": previous_key, "operator": "==", "value": value}
                if not isinstance(converted, list):
                    converted = [converted]
                conditions.append({"operator": ALL_LEGACY_OPERATORS_MAPPING[key], "conditions": converted})
        elif key in LEGACY_COMPARISON_OPERATORS_MAPPING:
            conditions.append({"field": previous_key, "operator": ALL_LEGACY_OPERATORS_MAPPING[key], "value": value})

    if len(conditions) == 1:
        return conditions[0]

    if previous_key is None:
        # Several (or zero) conditions at the root are joined by an implicit AND.
        return {"operator": "AND", "conditions": conditions}

    return conditions
|
||||
|
||||
|
||||
def _handle_list(filters, previous_key):
    """
    Convert a legacy filter value that is a list.

    Returns a list of converted items when `previous_key` is a legacy logical
    operator, `None` when it is a legacy comparison operator, and an "in"
    condition on `previous_key` in every other case.
    """
    if previous_key in LEGACY_LOGICAL_OPERATORS_MAPPING:
        converted_items = []
        for item in filters:
            converted_items.append(_internal_convert(item))
        return converted_items

    if previous_key in LEGACY_COMPARISON_OPERATORS_MAPPING:
        # The caller decides what to do with a list under a comparison operator.
        return None

    return {"field": previous_key, "operator": "in", "value": filters}
|
||||
|
||||
|
||||
def _handle_non_dict(filters, previous_key):
    """
    Convert a legacy filter value that is not a dictionary.

    The value is returned untouched when `previous_key` is a legacy operator;
    otherwise it becomes an "==" condition on `previous_key`.
    """
    if previous_key in ALL_LEGACY_OPERATORS_MAPPING:
        return filters
    return {"field": previous_key, "operator": "==", "value": filters}
|
||||
|
||||
|
||||
# Legacy (Haystack 1.x) operator names mapped to their new-style counterparts.

# Operators that combine other conditions.
LEGACY_LOGICAL_OPERATORS_MAPPING = {
    "$and": "AND",
    "$or": "OR",
    "$not": "NOT",
}

# Operators that compare a field against a value.
LEGACY_COMPARISON_OPERATORS_MAPPING = {
    "$eq": "==",
    "$ne": "!=",
    "$gt": ">",
    "$gte": ">=",
    "$lt": "<",
    "$lte": "<=",
    "$in": "in",
    "$nin": "not in",
}

# Union of both tables: logical operators first, then comparison operators.
ALL_LEGACY_OPERATORS_MAPPING = dict(LEGACY_LOGICAL_OPERATORS_MAPPING, **LEGACY_COMPARISON_OPERATORS_MAPPING)
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
upgrade:
|
||||
- |
|
||||
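Support for the legacy filter syntax has been completely removed. Filters must now be written in the new syntax; passing a legacy-style filter to `MetadataRouter` or `InMemoryDocumentStore` raises a `ValueError`. See the [docs](https://docs.haystack.deepset.ai/docs/metadata-filtering) for more details.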
|
||||
@ -274,36 +274,6 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904
|
||||
results = document_store.bm25_retrieval(query="doesn't matter, top_k is 10", top_k=10)
|
||||
assert len(results) == 0
|
||||
|
||||
def test_bm25_retrieval_with_filters(self, document_store: InMemoryDocumentStore):
    """BM25 retrieval with a legacy-style meta filter returns only the matching document."""
    expected = Document(content="Java is, well...", meta={"selected": True})
    document_store.write_documents([Document(), expected, Document(content="Bird watching")])

    hits = document_store.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})

    assert [hit.id for hit in hits] == [expected.id]
|
||||
|
||||
def test_bm25_retrieval_with_filters_keeps_default_filters(self, document_store: InMemoryDocumentStore):
    """A document that matches the meta filter but has no content is not returned by BM25 retrieval."""
    document_store.write_documents(
        [
            Document(meta={"selected": True}),
            Document(content="Gardening"),
            Document(content="Bird watching"),
        ]
    )

    hits = document_store.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})

    assert len(hits) == 0
|
||||
|
||||
def test_bm25_retrieval_with_filters_on_text_or_dataframe(self, document_store: InMemoryDocumentStore):
    """Filtering on `content` being None leaves only the dataframe document as a BM25 hit."""
    table = pd.DataFrame({"language": ["Python", "Java"], "use": ["Data Science", "Web"]})
    table_document = Document(dataframe=table)
    document_store.write_documents(
        [Document(), Document(content="Gardening"), Document(content="Bird watching"), table_document]
    )

    hits = document_store.bm25_retrieval(query="Java", top_k=10, filters={"content": None})

    assert [hit.id for hit in hits] == [table_document.id]
|
||||
|
||||
def test_bm25_retrieval_with_documents_with_mixed_content(self, document_store: InMemoryDocumentStore):
    """Of the stored documents, only the one holding both text and an embedding is returned."""
    vector = [1.0, 2.0, 3.0]
    text_and_embedding = Document(content="Gardening is a hobby", embedding=[1.0, 2.0, 3.0])
    document_store.write_documents(
        [Document(embedding=vector), text_and_embedding, Document(content="Bird watching")]
    )

    hits = document_store.bm25_retrieval(query="Gardening", top_k=10, filters={"embedding": {"$not": None}})

    assert [hit.id for hit in hits] == [text_and_embedding.id]
|
||||
|
||||
def test_embedding_retrieval(self):
|
||||
docstore = InMemoryDocumentStore(embedding_similarity_function="cosine")
|
||||
# Tests if the embedding retrieval method returns the correct document based on the input query embedding.
|
||||
|
||||
@ -6,7 +6,7 @@ import pandas as pd
|
||||
|
||||
from haystack import Document
|
||||
from haystack.errors import FilterError
|
||||
from haystack.utils.filters import convert, document_matches_filter
|
||||
from haystack.utils.filters import document_matches_filter
|
||||
|
||||
document_matches_filter_data = [
|
||||
# == operator params
|
||||
@ -708,21 +708,3 @@ filters_data = [
|
||||
id="Explicit $not with implicit $eq",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("old_style, new_style", filters_data)
def test_convert(old_style, new_style):
    """Each legacy filter in `filters_data` converts to its paired new-style filter."""
    converted = convert(old_style)
    assert converted == new_style
|
||||
|
||||
|
||||
def test_convert_with_incorrect_input_type():
    """`convert` raises ValueError when given something that is not a dictionary."""
    not_a_dict = "some string"
    with pytest.raises(ValueError):
        convert(not_a_dict)
|
||||
|
||||
|
||||
def test_convert_with_incorrect_filter_nesting():
    """`convert` raises FilterError when a field name is nested inside another field name."""
    malformed_filters = (
        {"number": {"page": "100"}},
        {"number": {"page": {"chapter": "intro"}}},
    )
    for malformed in malformed_filters:
        with pytest.raises(FilterError):
            convert(malformed)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user