fix: Fix filters to handle date times with timezones (loading and comparison) (#8800)

* Fix parsing of datetimes with timezones, and fix comparison of naive and aware datetimes.

* Add release note

* Add more filter tests
This commit is contained in:
Sebastian Husch Lee 2025-02-04 05:51:06 -08:00 committed by GitHub
parent ad5d29d92f
commit 1ee86b5041
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 87 additions and 167 deletions

View File

@ -76,6 +76,11 @@ class MetadataRouter:
```
"""
self.rules = rules
for rule in self.rules.values():
if "operator" not in rule:
raise ValueError(
"Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
)
component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
def run(self, documents: List[Document]):
@ -95,11 +100,6 @@ class MetadataRouter:
for document in documents:
cur_document_matched = False
for edge, rule in self.rules.items():
if "operator" not in rule:
raise ValueError(
"Invalid filter syntax. "
"See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
)
if document_matches_filter(rule, document):
output[edge].append(document)
cur_document_matched = True

View File

@ -6,6 +6,7 @@ from dataclasses import fields
from datetime import datetime
from typing import Any, Dict, List, Optional
import dateutil.parser
import pandas as pd
from haystack.dataclasses import Document
@ -69,18 +70,48 @@ def _greater_than(document_value: Any, filter_value: Any) -> bool:
if isinstance(document_value, str) or isinstance(filter_value, str):
try:
document_value = datetime.fromisoformat(document_value)
filter_value = datetime.fromisoformat(filter_value)
document_value = _parse_date(document_value)
filter_value = _parse_date(filter_value)
document_value, filter_value = _ensure_both_dates_naive_or_aware(document_value, filter_value)
except FilterError as exc:
raise exc
if type(filter_value) in [list, pd.DataFrame]:
msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
raise FilterError(msg)
return document_value > filter_value
def _parse_date(value):
"""Try parsing the value as an ISO format date, then fall back to dateutil.parser."""
try:
return datetime.fromisoformat(value)
except (ValueError, TypeError):
try:
return dateutil.parser.parse(value)
except (ValueError, TypeError) as exc:
msg = (
"Can't compare strings using operators '>', '>=', '<', '<='. "
"Strings are only comparable if they are ISO formatted dates."
)
raise FilterError(msg) from exc
if type(filter_value) in [list, pd.DataFrame]:
msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
raise FilterError(msg)
return document_value > filter_value
def _ensure_both_dates_naive_or_aware(date1: datetime, date2: datetime):
"""Ensure that both dates are either naive or aware."""
# Both naive
if date1.tzinfo is None and date2.tzinfo is None:
return date1, date2
# Both aware
if date1.tzinfo is not None and date2.tzinfo is not None:
return date1, date2
# One naive, one aware
if date1.tzinfo is None:
date1 = date1.replace(tzinfo=date2.tzinfo)
else:
date2 = date2.replace(tzinfo=date1.tzinfo)
return date1, date2
def _greater_than_equal(document_value: Any, filter_value: Any) -> bool:

View File

@ -0,0 +1,6 @@
---
enhancements:
- |
Enhancements to Date Filtering in MetadataRouter
- Improved date parsing in filter utilities by introducing `_parse_date`, which first attempts `datetime.fromisoformat(value)` for backward compatibility and then falls back to `dateutil.parser.parse(value)` for broader ISO 8601 support.
- Resolved a common issue where comparing naive and timezone-aware datetimes raised a `TypeError`. Added `_ensure_both_dates_naive_or_aware`, which ensures both datetimes are either naive or aware: if exactly one is missing a timezone, it is assigned the timezone of the other (its date and time fields are not converted).

View File

@ -35,3 +35,30 @@ class TestMetadataRouter:
assert output["edge_1"][0].meta["created_at"] == "2023-02-01"
assert output["edge_2"][0].meta["created_at"] == "2023-05-01"
assert output["unmatched"][0].meta["created_at"] == "2023-08-01"
def test_run_wrong_filter(self):
    """Constructing a router must raise `ValueError` when one of the rules has no "operator" key."""
    valid_rule = {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}
    malformed_rule = {"wrong_value": "meta.created_at == 2023-04-01"}

    with pytest.raises(ValueError):
        MetadataRouter(rules={"edge_1": valid_rule, "wrong_filter": malformed_rule})
def test_run_datetime_with_timezone(self):
    """
    Route documents whose `created_at` is a "Z"-suffixed timestamp against a date-only ">=" filter value.

    The two documents dated on or after the filter value must land on `edge_1` in input order; the earlier
    one must end up in `unmatched`.
    """
    cutoff_condition = {"field": "meta.created_at", "operator": ">=", "value": "2025-02-01"}
    router = MetadataRouter(rules={"edge_1": {"operator": "AND", "conditions": [cutoff_condition]}})

    timestamps = [
        "2025-02-03T12:45:46.435816Z",
        "2025-02-01T12:45:46.435816Z",
        "2025-01-03T12:45:46.435816Z",
    ]
    output = router.run(documents=[Document(meta={"created_at": stamp}) for stamp in timestamps])

    matched = output["edge_1"]
    assert len(matched) == 2
    assert matched[0].meta["created_at"] == "2025-02-03T12:45:46.435816Z"
    assert matched[1].meta["created_at"] == "2025-02-01T12:45:46.435816Z"
    assert output["unmatched"][0].meta["created_at"] == "2025-01-03T12:45:46.435816Z"

View File

@ -485,6 +485,18 @@ document_matches_filter_data = [
True,
id="NOT operator with Document matching no condition",
),
pytest.param(
{"field": "meta.date", "operator": "==", "value": "2025-02-03T12:45:46.435816Z"},
Document(meta={"date": "2025-02-03T12:45:46.435816Z"}),
True,
id="== operator with ISO 8601 datetime Document value",
),
pytest.param(
{"field": "meta.date", "operator": ">=", "value": "2025-02-01"},
Document(meta={"date": "2025-02-03T12:45:46.435816Z"}),
True,
id=">= operator with naive and aware ISO 8601 datetime Document value",
),
]
@ -552,159 +564,3 @@ def test_document_matches_filter_raises_error(filter):
with pytest.raises(FilterError):
document = Document(meta={"page": 10})
document_matches_filter(filter, document)
filters_data = [
pytest.param(
{
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
}
},
{
"operator": "AND",
"conditions": [
{"field": "type", "operator": "==", "value": "article"},
{"field": "date", "operator": ">=", "value": "2015-01-01"},
{"field": "date", "operator": "<", "value": "2021-01-01"},
{"field": "rating", "operator": ">=", "value": 3},
{
"operator": "OR",
"conditions": [
{"field": "genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "publisher", "operator": "==", "value": "nytimes"},
],
},
],
},
id="All operators explicit",
),
pytest.param(
{
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {"genre": ["economy", "politics"], "publisher": "nytimes"},
},
{
"operator": "AND",
"conditions": [
{"field": "type", "operator": "==", "value": "article"},
{"field": "date", "operator": ">=", "value": "2015-01-01"},
{"field": "date", "operator": "<", "value": "2021-01-01"},
{"field": "rating", "operator": ">=", "value": 3},
{
"operator": "OR",
"conditions": [
{"field": "genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "publisher", "operator": "==", "value": "nytimes"},
],
},
],
},
id="Root $and implicit",
),
pytest.param(
{
"$or": [
{"Type": "News Paper", "Date": {"$lt": "2019-01-01"}},
{"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}},
]
},
{
"operator": "OR",
"conditions": [
{
"operator": "AND",
"conditions": [
{"field": "Type", "operator": "==", "value": "News Paper"},
{"field": "Date", "operator": "<", "value": "2019-01-01"},
],
},
{
"operator": "AND",
"conditions": [
{"field": "Type", "operator": "==", "value": "Blog Post"},
{"field": "Date", "operator": ">=", "value": "2019-01-01"},
],
},
],
},
id="Root $or with list and multiple comparisons",
),
pytest.param(
{"text": "A Foo Document 1"},
{"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]},
id="Implicit root $and and field $eq",
),
pytest.param(
{"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}},
{
"operator": "OR",
"conditions": [
{
"operator": "OR",
"conditions": [
{"field": "name", "operator": "==", "value": "name_0"},
{"field": "name", "operator": "==", "value": "name_1"},
],
},
{"field": "number", "operator": "<", "value": 1.0},
],
},
id="Root $or with dict and field $or with list",
),
pytest.param(
{"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]},
{
"operator": "AND",
"conditions": [
{"field": "number", "operator": "<=", "value": 2},
{"field": "number", "operator": ">=", "value": 0},
{"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
],
},
id="Implicit $and and field $in",
),
pytest.param(
{"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}},
{
"operator": "AND",
"conditions": [
{"field": "number", "operator": "<=", "value": 2},
{"field": "number", "operator": ">=", "value": 0},
],
},
id="Implicit root $and and field $and with list",
),
pytest.param(
{
"$not": {
"number": {"$lt": 1.0},
"$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"chapter": {"$eq": "intro"}}},
}
},
{
"operator": "NOT",
"conditions": [
{"field": "number", "operator": "<", "value": 1.0},
{
"operator": "AND",
"conditions": [
{"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
{"operator": "NOT", "conditions": [{"field": "chapter", "operator": "==", "value": "intro"}]},
],
},
],
},
id="Root explicit $not",
),
pytest.param(
{"page": {"$not": 102}},
{"operator": "NOT", "conditions": [{"field": "page", "operator": "==", "value": 102}]},
id="Explicit $not with implicit $eq",
),
]