mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-08 04:56:45 +00:00
fix: Fix filters to handle date times with timezones (loading and comparison) (#8800)
* Fix date-time parsing with timezones, and comparison of naive and aware date-times. * Add release note * Add more filter tests
This commit is contained in:
parent
ad5d29d92f
commit
1ee86b5041
@ -76,6 +76,11 @@ class MetadataRouter:
|
||||
```
|
||||
"""
|
||||
self.rules = rules
|
||||
for rule in self.rules.values():
|
||||
if "operator" not in rule:
|
||||
raise ValueError(
|
||||
"Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
||||
)
|
||||
component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
|
||||
|
||||
def run(self, documents: List[Document]):
|
||||
@ -95,11 +100,6 @@ class MetadataRouter:
|
||||
for document in documents:
|
||||
cur_document_matched = False
|
||||
for edge, rule in self.rules.items():
|
||||
if "operator" not in rule:
|
||||
raise ValueError(
|
||||
"Invalid filter syntax. "
|
||||
"See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
|
||||
)
|
||||
if document_matches_filter(rule, document):
|
||||
output[edge].append(document)
|
||||
cur_document_matched = True
|
||||
|
||||
@ -6,6 +6,7 @@ from dataclasses import fields
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import dateutil.parser
|
||||
import pandas as pd
|
||||
|
||||
from haystack.dataclasses import Document
|
||||
@ -69,18 +70,48 @@ def _greater_than(document_value: Any, filter_value: Any) -> bool:
|
||||
|
||||
if isinstance(document_value, str) or isinstance(filter_value, str):
|
||||
try:
|
||||
document_value = datetime.fromisoformat(document_value)
|
||||
filter_value = datetime.fromisoformat(filter_value)
|
||||
document_value = _parse_date(document_value)
|
||||
filter_value = _parse_date(filter_value)
|
||||
document_value, filter_value = _ensure_both_dates_naive_or_aware(document_value, filter_value)
|
||||
except FilterError as exc:
|
||||
raise exc
|
||||
if type(filter_value) in [list, pd.DataFrame]:
|
||||
msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
|
||||
raise FilterError(msg)
|
||||
return document_value > filter_value
|
||||
|
||||
|
||||
def _parse_date(value):
    """
    Convert a value into a ``datetime`` so it can be used in ordering comparisons.

    Parsing is attempted with ``datetime.fromisoformat`` first; if that fails, it is
    retried with ``dateutil.parser.parse``.

    :param value: The value to convert. Normally a date string; a value that already is a
        ``datetime`` is returned unchanged.
    :returns: The resulting ``datetime``.
    :raises FilterError: If the value can be handled by neither parser.
    """
    # An existing datetime needs no parsing. Both parsers below reject it with
    # TypeError, which would make a datetime-vs-string comparison fail.
    if isinstance(value, datetime):
        return value

    try:
        return datetime.fromisoformat(value)
    except (ValueError, TypeError):
        # Not accepted by the strict stdlib parser; try the more permissive one below.
        pass

    try:
        # NOTE(review): dateutil's general parser is lenient and may accept strings that
        # are not ISO 8601 dates - confirm this is the intended breadth.
        return dateutil.parser.parse(value)
    except (ValueError, TypeError, OverflowError) as exc:
        # OverflowError is documented by dateutil for out-of-range numeric input; it is
        # reported as a FilterError like every other unparsable value.
        msg = (
            "Can't compare strings using operators '>', '>=', '<', '<='. "
            "Strings are only comparable if they are ISO formatted dates."
        )
        raise FilterError(msg) from exc
|
||||
if type(filter_value) in [list, pd.DataFrame]:
|
||||
msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
|
||||
raise FilterError(msg)
|
||||
return document_value > filter_value
|
||||
|
||||
|
||||
def _ensure_both_dates_naive_or_aware(date1: datetime, date2: datetime):
    """
    Return the two datetimes in a form where either both or neither carry a ``tzinfo``.

    When exactly one of them has no ``tzinfo``, it is given the other's ``tzinfo`` through
    ``datetime.replace``, so its date and time fields stay as they are. When the two
    already agree, they are returned untouched.

    :param date1: First datetime.
    :param date2: Second datetime.
    :returns: A ``(date1, date2)`` tuple, with at most one element replaced.
    """
    first_is_naive = date1.tzinfo is None
    second_is_naive = date2.tzinfo is None

    # Same kind on both sides (naive/naive or aware/aware): nothing to align.
    if first_is_naive == second_is_naive:
        return date1, date2

    # Exactly one side is naive: attach the aware side's tzinfo to it.
    if first_is_naive:
        return date1.replace(tzinfo=date2.tzinfo), date2
    return date1, date2.replace(tzinfo=date1.tzinfo)
|
||||
|
||||
|
||||
def _greater_than_equal(document_value: Any, filter_value: Any) -> bool:
|
||||
|
||||
@ -0,0 +1,6 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Enhancements to Date Filtering in MetadataRouter
|
||||
- Improved date parsing in filter utilities by introducing `_parse_date`, which first attempts `datetime.fromisoformat(value)` for backward compatibility and then falls back to `dateutil.parser.parse(value)` for broader ISO 8601 support.
|
||||
- Resolved a common issue where comparing naive and timezone-aware datetimes resulted in a `TypeError`. Added `_ensure_both_dates_naive_or_aware`, which ensures both datetimes are either naive or aware. If one is missing a timezone, it is assigned the timezone of the other for consistency; its date and time values are left unchanged (no conversion is performed).
|
||||
@ -35,3 +35,30 @@ class TestMetadataRouter:
|
||||
assert output["edge_1"][0].meta["created_at"] == "2023-02-01"
|
||||
assert output["edge_2"][0].meta["created_at"] == "2023-05-01"
|
||||
assert output["unmatched"][0].meta["created_at"] == "2023-08-01"
|
||||
|
||||
def test_run_wrong_filter(self):
    """A rule that has no "operator" key makes the router constructor raise ValueError."""
    valid_rule = {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}
    malformed_rule = {"wrong_value": "meta.created_at == 2023-04-01"}

    with pytest.raises(ValueError):
        MetadataRouter(rules={"edge_1": valid_rule, "wrong_filter": malformed_rule})
|
||||
|
||||
def test_run_datetime_with_timezone(self):
    """Timezone-suffixed timestamps are routed correctly against a date-only filter value."""
    on_or_after_feb_1 = {"field": "meta.created_at", "operator": ">=", "value": "2025-02-01"}
    router = MetadataRouter(rules={"edge_1": {"operator": "AND", "conditions": [on_or_after_feb_1]}})

    timestamps = [
        "2025-02-03T12:45:46.435816Z",
        "2025-02-01T12:45:46.435816Z",
        "2025-01-03T12:45:46.435816Z",
    ]
    output = router.run(documents=[Document(meta={"created_at": stamp}) for stamp in timestamps])

    # The two February documents match, in input order; the January one does not.
    assert len(output["edge_1"]) == 2
    routed = [doc.meta["created_at"] for doc in output["edge_1"]]
    assert routed == ["2025-02-03T12:45:46.435816Z", "2025-02-01T12:45:46.435816Z"]
    assert output["unmatched"][0].meta["created_at"] == "2025-01-03T12:45:46.435816Z"
|
||||
|
||||
@ -485,6 +485,18 @@ document_matches_filter_data = [
|
||||
True,
|
||||
id="NOT operator with Document matching no condition",
|
||||
),
|
||||
pytest.param(
|
||||
{"field": "meta.date", "operator": "==", "value": "2025-02-03T12:45:46.435816Z"},
|
||||
Document(meta={"date": "2025-02-03T12:45:46.435816Z"}),
|
||||
True,
|
||||
id="== operator with ISO 8601 datetime Document value",
|
||||
),
|
||||
pytest.param(
|
||||
{"field": "meta.date", "operator": ">=", "value": "2025-02-01"},
|
||||
Document(meta={"date": "2025-02-03T12:45:46.435816Z"}),
|
||||
True,
|
||||
id=">= operator with naive and aware ISO 8601 datetime Document value",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -552,159 +564,3 @@ def test_document_matches_filter_raises_error(filter):
|
||||
with pytest.raises(FilterError):
|
||||
document = Document(meta={"page": 10})
|
||||
document_matches_filter(filter, document)
|
||||
|
||||
|
||||
filters_data = [
|
||||
pytest.param(
|
||||
{
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}},
|
||||
}
|
||||
},
|
||||
{
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "type", "operator": "==", "value": "article"},
|
||||
{"field": "date", "operator": ">=", "value": "2015-01-01"},
|
||||
{"field": "date", "operator": "<", "value": "2021-01-01"},
|
||||
{"field": "rating", "operator": ">=", "value": 3},
|
||||
{
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{"field": "genre", "operator": "in", "value": ["economy", "politics"]},
|
||||
{"field": "publisher", "operator": "==", "value": "nytimes"},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
id="All operators explicit",
|
||||
),
|
||||
pytest.param(
|
||||
{
|
||||
"type": "article",
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {"genre": ["economy", "politics"], "publisher": "nytimes"},
|
||||
},
|
||||
{
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "type", "operator": "==", "value": "article"},
|
||||
{"field": "date", "operator": ">=", "value": "2015-01-01"},
|
||||
{"field": "date", "operator": "<", "value": "2021-01-01"},
|
||||
{"field": "rating", "operator": ">=", "value": 3},
|
||||
{
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{"field": "genre", "operator": "in", "value": ["economy", "politics"]},
|
||||
{"field": "publisher", "operator": "==", "value": "nytimes"},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
id="Root $and implicit",
|
||||
),
|
||||
pytest.param(
|
||||
{
|
||||
"$or": [
|
||||
{"Type": "News Paper", "Date": {"$lt": "2019-01-01"}},
|
||||
{"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}},
|
||||
]
|
||||
},
|
||||
{
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "Type", "operator": "==", "value": "News Paper"},
|
||||
{"field": "Date", "operator": "<", "value": "2019-01-01"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "Type", "operator": "==", "value": "Blog Post"},
|
||||
{"field": "Date", "operator": ">=", "value": "2019-01-01"},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
id="Root $or with list and multiple comparisons",
|
||||
),
|
||||
pytest.param(
|
||||
{"text": "A Foo Document 1"},
|
||||
{"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]},
|
||||
id="Implicit root $and and field $eq",
|
||||
),
|
||||
pytest.param(
|
||||
{"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}},
|
||||
{
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{"field": "name", "operator": "==", "value": "name_0"},
|
||||
{"field": "name", "operator": "==", "value": "name_1"},
|
||||
],
|
||||
},
|
||||
{"field": "number", "operator": "<", "value": 1.0},
|
||||
],
|
||||
},
|
||||
id="Root $or with dict and field $or with list",
|
||||
),
|
||||
pytest.param(
|
||||
{"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]},
|
||||
{
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "number", "operator": "<=", "value": 2},
|
||||
{"field": "number", "operator": ">=", "value": 0},
|
||||
{"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
|
||||
],
|
||||
},
|
||||
id="Implicit $and and field $in",
|
||||
),
|
||||
pytest.param(
|
||||
{"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}},
|
||||
{
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "number", "operator": "<=", "value": 2},
|
||||
{"field": "number", "operator": ">=", "value": 0},
|
||||
],
|
||||
},
|
||||
id="Implicit root $and and field $and with list",
|
||||
),
|
||||
pytest.param(
|
||||
{
|
||||
"$not": {
|
||||
"number": {"$lt": 1.0},
|
||||
"$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"chapter": {"$eq": "intro"}}},
|
||||
}
|
||||
},
|
||||
{
|
||||
"operator": "NOT",
|
||||
"conditions": [
|
||||
{"field": "number", "operator": "<", "value": 1.0},
|
||||
{
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "name", "operator": "in", "value": ["name_0", "name_1"]},
|
||||
{"operator": "NOT", "conditions": [{"field": "chapter", "operator": "==", "value": "intro"}]},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
id="Root explicit $not",
|
||||
),
|
||||
pytest.param(
|
||||
{"page": {"$not": 102}},
|
||||
{"operator": "NOT", "conditions": [{"field": "page", "operator": "==", "value": 102}]},
|
||||
id="Explicit $not with implicit $eq",
|
||||
),
|
||||
]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user