diff --git a/haystack/components/routers/metadata_router.py b/haystack/components/routers/metadata_router.py index 7c6c007d1..f0604d912 100644 --- a/haystack/components/routers/metadata_router.py +++ b/haystack/components/routers/metadata_router.py @@ -76,6 +76,11 @@ class MetadataRouter: ``` """ self.rules = rules + for rule in self.rules.values(): + if "operator" not in rule: + raise ValueError( + "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." + ) component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules}) def run(self, documents: List[Document]): @@ -95,11 +100,6 @@ class MetadataRouter: for document in documents: cur_document_matched = False for edge, rule in self.rules.items(): - if "operator" not in rule: - raise ValueError( - "Invalid filter syntax. " - "See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." - ) if document_matches_filter(rule, document): output[edge].append(document) cur_document_matched = True diff --git a/haystack/utils/filters.py b/haystack/utils/filters.py index bddb422ef..d9178b62d 100644 --- a/haystack/utils/filters.py +++ b/haystack/utils/filters.py @@ -6,6 +6,7 @@ from dataclasses import fields from datetime import datetime from typing import Any, Dict, List, Optional +import dateutil.parser import pandas as pd from haystack.dataclasses import Document @@ -69,18 +70,48 @@ def _greater_than(document_value: Any, filter_value: Any) -> bool: if isinstance(document_value, str) or isinstance(filter_value, str): try: - document_value = datetime.fromisoformat(document_value) - filter_value = datetime.fromisoformat(filter_value) + document_value = _parse_date(document_value) + filter_value = _parse_date(filter_value) + document_value, filter_value = _ensure_both_dates_naive_or_aware(document_value, filter_value) + except FilterError as exc: + raise exc + if type(filter_value) in [list, pd.DataFrame]: + msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return document_value > filter_value + + +def _parse_date(value): + """Try parsing the value as an ISO format date, then fall back to dateutil.parser.""" + try: + return datetime.fromisoformat(value) + except (ValueError, TypeError): + try: + return dateutil.parser.parse(value) except (ValueError, TypeError) as exc: msg = ( "Can't compare strings using operators '>', '>=', '<', '<='. " "Strings are only comparable if they are ISO formatted dates." ) raise FilterError(msg) from exc - if type(filter_value) in [list, pd.DataFrame]: - msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='" - raise FilterError(msg) - return document_value > filter_value + + +def _ensure_both_dates_naive_or_aware(date1: datetime, date2: datetime): + """Ensure that both dates are either naive or aware.""" + # Both naive + if date1.tzinfo is None and date2.tzinfo is None: + return date1, date2 + + # Both aware + if date1.tzinfo is not None and date2.tzinfo is not None: + return date1, date2 + + # One naive, one aware + if date1.tzinfo is None: + date1 = date1.replace(tzinfo=date2.tzinfo) + else: + date2 = date2.replace(tzinfo=date1.tzinfo) + return date1, date2 def _greater_than_equal(document_value: Any, filter_value: Any) -> bool: diff --git a/releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml b/releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml new file mode 100644 index 000000000..b6e071ecf --- /dev/null +++ b/releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml @@ -0,0 +1,6 @@ +--- +enhancements: + - | + Enhancements to Date Filtering in MetadataRouter + - Improved date parsing in filter utilities by introducing `_parse_date`, which first attempts `datetime.fromisoformat(value)` for backward compatibility and then falls back to dateutil.parser.parse() for broader ISO 8601 support. + - Resolved a common issue where comparing naive and timezone-aware datetimes resulted in TypeError. Added `_ensure_both_dates_naive_or_aware`, which ensures both datetimes are either naive or aware. If one is missing a timezone, it is assigned the timezone of the other for consistency. diff --git a/test/components/routers/test_metadata_router.py b/test/components/routers/test_metadata_router.py index d9e8ce987..2b7e96470 100644 --- a/test/components/routers/test_metadata_router.py +++ b/test/components/routers/test_metadata_router.py @@ -35,3 +35,30 @@ class TestMetadataRouter: assert output["edge_1"][0].meta["created_at"] == "2023-02-01" assert output["edge_2"][0].meta["created_at"] == "2023-05-01" assert output["unmatched"][0].meta["created_at"] == "2023-08-01" + + def test_run_wrong_filter(self): + rules = { + "edge_1": {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}, + "wrong_filter": {"wrong_value": "meta.created_at == 2023-04-01"}, + } + with pytest.raises(ValueError): + MetadataRouter(rules=rules) + + def test_run_datetime_with_timezone(self): + rules = { + "edge_1": { + "operator": "AND", + "conditions": [{"field": "meta.created_at", "operator": ">=", "value": "2025-02-01"}], + } + } + router = MetadataRouter(rules=rules) + documents = [ + Document(meta={"created_at": "2025-02-03T12:45:46.435816Z"}), + Document(meta={"created_at": "2025-02-01T12:45:46.435816Z"}), + Document(meta={"created_at": "2025-01-03T12:45:46.435816Z"}), + ] + output = router.run(documents=documents) + assert len(output["edge_1"]) == 2 + assert output["edge_1"][0].meta["created_at"] == "2025-02-03T12:45:46.435816Z" + assert output["edge_1"][1].meta["created_at"] == "2025-02-01T12:45:46.435816Z" + assert output["unmatched"][0].meta["created_at"] == "2025-01-03T12:45:46.435816Z" diff --git a/test/utils/test_filters.py b/test/utils/test_filters.py index 708e50cb7..7c3da7cc4 100644 --- a/test/utils/test_filters.py +++ b/test/utils/test_filters.py @@ -485,6 +485,18 @@ document_matches_filter_data = [ True, id="NOT operator with Document matching no condition", ), + pytest.param( + {"field": "meta.date", "operator": "==", "value": "2025-02-03T12:45:46.435816Z"}, + Document(meta={"date": "2025-02-03T12:45:46.435816Z"}), + True, + id="== operator with ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">=", "value": "2025-02-01"}, + Document(meta={"date": "2025-02-03T12:45:46.435816Z"}), + True, + id=">= operator with naive and aware ISO 8601 datetime Document value", + ), ] @@ -552,159 +564,3 @@ def test_document_matches_filter_raises_error(filter): with pytest.raises(FilterError): document = Document(meta={"page": 10}) document_matches_filter(filter, document) - - -filters_data = [ - pytest.param( - { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}}, - } - }, - { - "operator": "AND", - "conditions": [ - {"field": "type", "operator": "==", "value": "article"}, - {"field": "date", "operator": ">=", "value": "2015-01-01"}, - {"field": "date", "operator": "<", "value": "2021-01-01"}, - {"field": "rating", "operator": ">=", "value": 3}, - { - "operator": "OR", - "conditions": [ - {"field": "genre", "operator": "in", "value": ["economy", "politics"]}, - {"field": "publisher", "operator": "==", "value": "nytimes"}, - ], - }, - ], - }, - id="All operators explicit", - ), - pytest.param( - { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": {"genre": ["economy", "politics"], "publisher": "nytimes"}, - }, - { - "operator": "AND", - "conditions": [ - {"field": "type", "operator": "==", "value": "article"}, - {"field": "date", "operator": ">=", "value": "2015-01-01"}, - {"field": "date", "operator": "<", "value": "2021-01-01"}, - {"field": "rating", "operator": ">=", "value": 3}, - { - "operator": "OR", - "conditions": [ - {"field": "genre", "operator": "in", "value": ["economy", "politics"]}, - {"field": "publisher", "operator": "==", "value": "nytimes"}, - ], - }, - ], - }, - id="Root $and implicit", - ), - pytest.param( - { - "$or": [ - {"Type": "News Paper", "Date": {"$lt": "2019-01-01"}}, - {"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}}, - ] - }, - { - "operator": "OR", - "conditions": [ - { - "operator": "AND", - "conditions": [ - {"field": "Type", "operator": "==", "value": "News Paper"}, - {"field": "Date", "operator": "<", "value": "2019-01-01"}, - ], - }, - { - "operator": "AND", - "conditions": [ - {"field": "Type", "operator": "==", "value": "Blog Post"}, - {"field": "Date", "operator": ">=", "value": "2019-01-01"}, - ], - }, - ], - }, - id="Root $or with list and multiple comparisons", - ), - pytest.param( - {"text": "A Foo Document 1"}, - {"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]}, - id="Implicit root $and and field $eq", - ), - pytest.param( - {"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}}, - { - "operator": "OR", - "conditions": [ - { - "operator": "OR", - "conditions": [ - {"field": "name", "operator": "==", "value": "name_0"}, - {"field": "name", "operator": "==", "value": "name_1"}, - ], - }, - {"field": "number", "operator": "<", "value": 1.0}, - ], - }, - id="Root $or with dict and field $or with list", - ), - pytest.param( - {"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]}, - { - "operator": "AND", - "conditions": [ - {"field": "number", "operator": "<=", "value": 2}, - {"field": "number", "operator": ">=", "value": 0}, - {"field": "name", "operator": "in", "value": ["name_0", "name_1"]}, - ], - }, - id="Implicit $and and field $in", - ), - pytest.param( - {"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}}, - { - "operator": "AND", - "conditions": [ - {"field": "number", "operator": "<=", "value": 2}, - {"field": "number", "operator": ">=", "value": 0}, - ], - }, - id="Implicit root $and and field $and with list", - ), - pytest.param( - { - "$not": { - "number": {"$lt": 1.0}, - "$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"chapter": {"$eq": "intro"}}}, - } - }, - { - "operator": "NOT", - "conditions": [ - {"field": "number", "operator": "<", "value": 1.0}, - { - "operator": "AND", - "conditions": [ - {"field": "name", "operator": "in", "value": ["name_0", "name_1"]}, - {"operator": "NOT", "conditions": [{"field": "chapter", "operator": "==", "value": "intro"}]}, - ], - }, - ], - }, - id="Root explicit $not", - ), - pytest.param( - {"page": {"$not": 102}}, - {"operator": "NOT", "conditions": [{"field": "page", "operator": "==", "value": 102}]}, - id="Explicit $not with implicit $eq", - ), -]