refactor: Add support for new filters declaration (#6397)

* Rework filter logic for InMemoryDocumentStore to support new filters
declaration

* Fix legacy filters tests

* Simplify logic and handle dates comparison

* Rework MetadataRouter to support new filters

* Update docstrings

* Add release notes

* Fix linting

* Avoid duplicating filters specifications

* Handle corner case

* Simplify docstring

* Fix filters logic and tests

* Fix Document Store testing legacy filters tests
This commit is contained in:
Silvano Cerza 2023-11-24 11:22:46 +01:00 committed by GitHub
parent 28c2b09d90
commit fd16ec63cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 883 additions and 890 deletions

View File

@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path):
preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
preprocessing_pipeline.add_component(
instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router"
instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router"
)
preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
preprocessing_pipeline.add_component(

View File

@ -1,7 +1,7 @@
from typing import Dict, List
from haystack.preview import component, Document
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.utils.filters import document_matches_filter, convert
@component
@ -19,12 +19,36 @@ class MetadataRouter:
follow the format of filtering expressions in Haystack. For example:
```python
{
"edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}},
"edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}},
"edge_3": {"created_at": {"$gte": "2023-07-01", "$lt": "2023-10-01"}},
"edge_4": {"created_at": {"$gte": "2023-10-01", "$lt": "2024-01-01"}},
}
```
"edge_1": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
],
},
"edge_2": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
],
},
"edge_3": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
],
},
"edge_4": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
{"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
],
},
}
```
"""
self.rules = rules
component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
@ -43,6 +67,9 @@ class MetadataRouter:
for document in documents:
cur_document_matched = False
for edge, rule in self.rules.items():
if "operator" not in rule:
# Must be a legacy filter, convert it
rule = convert(rule)
if document_matches_filter(rule, document):
output[edge].append(document)
cur_document_matched = True

View File

@ -11,7 +11,7 @@ from haystack.preview import default_from_dict, default_to_dict
from haystack.preview.document_stores.decorator import document_store
from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.protocols import DuplicatePolicy
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.utils.filters import document_matches_filter, convert
from haystack.preview.document_stores.errors import DuplicateDocumentError, DocumentStoreError
from haystack.preview.utils import expit
@ -92,75 +92,15 @@ class InMemoryDocumentStore:
"""
Returns the documents that match the filters provided.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
`"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
as default operation.
Example:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators can take a list of
dictionaries as value.
Example:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation.
:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
"""
if filters:
return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)]
if "operator" not in filters:
filters = convert(filters)
return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
return list(self.storage.values())
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int:
@ -220,9 +160,17 @@ class InMemoryDocumentStore:
if not query:
raise ValueError("Query should be a non-empty string")
content_type_filter = {"$or": {"content": {"$not": None}, "dataframe": {"$not": None}}}
content_type_filter = {
"operator": "OR",
"conditions": [
{"field": "content", "operator": "!=", "value": None},
{"field": "dataframe", "operator": "!=", "value": None},
],
}
if filters:
filters = {"$and": [content_type_filter, filters]}
if "operator" not in filters:
filters = convert(filters)
filters = {"operator": "AND", "conditions": [content_type_filter, filters]}
else:
filters = content_type_filter
all_documents = self.filter_documents(filters=filters)

View File

@ -51,69 +51,64 @@ class DocumentStore(Protocol):
"""
Returns the documents that match the filters provided.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
`"$lte"`) or a metadata field name.
Filters are defined as nested dictionaries that can be of two types:
- Comparison
- Logic
Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
as default operation.
Comparison dictionaries must contain the keys:
Example:
- `field`
- `operator`
- `value`
Logic dictionaries must contain the keys:
- `operator`
- `conditions`
The `conditions` key must be a list of dictionaries, either of type Comparison or Logic.
The `operator` value in Comparison dictionaries must be one of:
- `==`
- `!=`
- `>`
- `>=`
- `<`
- `<=`
- `in`
- `not in`
The `operator` values in Logic dictionaries must be one of:
- `NOT`
- `OR`
- `AND`
A simple filter:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
filters = {"field": "meta.type", "operator": "==", "value": "article"}
```
To use the same logical operator multiple times on the same level, logical operators can take a list of
dictionaries as value.
Example:
A more complex filter:
```python
filters = {
"$or": [
"operator": "AND",
"conditions": [
{"field": "meta.type", "operator": "==", "value": "article"},
{"field": "meta.date", "operator": ">=", "value": 1420066800},
{"field": "meta.date", "operator": "<", "value": 1609455600},
{"field": "meta.rating", "operator": ">=", "value": 3},
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
"operator": "OR",
"conditions": [
{"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "meta.publisher", "operator": "==", "value": "nytimes"},
],
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
],
}
```
:param filters: the filters to apply to the document list.
:return: a list of Documents that match the given filters.

View File

@ -236,7 +236,7 @@ class LegacyFilterDocumentsInvalidFiltersTest(FilterableDocsFixtureMixin):
@pytest.mark.unit
def test_incorrect_filter_type(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
with pytest.raises(FilterError):
with pytest.raises(ValueError):
document_store.filter_documents(filters="something odd") # type: ignore
@pytest.mark.unit
@ -574,7 +574,9 @@ class LegacyFilterDocumentsLessThanTest(FilterableDocsFixtureMixin):
def test_lt_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
result = document_store.filter_documents(filters={"number": {"$lt": 0.0}})
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] < 0]
assert result == [
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] < 0
]
@pytest.mark.unit
def test_lt_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
@ -614,7 +616,9 @@ class LegacyFilterDocumentsLessThanEqualTest(FilterableDocsFixtureMixin):
def test_lte_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
document_store.write_documents(filterable_docs)
result = document_store.filter_documents(filters={"number": {"$lte": 2.0}})
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] <= 2.0]
assert result == [
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] <= 2.0
]
@pytest.mark.unit
def test_lte_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
@ -658,7 +662,8 @@ class LegacyFilterDocumentsSimpleLogicalTest(FilterableDocsFixtureMixin):
assert result == [
doc
for doc in filterable_docs
if (("number" in doc.meta and doc.meta["number"] < 1) or doc.meta.get("name") in ["name_0", "name_1"])
if (doc.meta.get("number") is not None and doc.meta["number"] < 1)
or doc.meta.get("name") in ["name_0", "name_1"]
]
@pytest.mark.unit
@ -733,7 +738,10 @@ class LegacyFilterDocumentsNestedLogicalTest(FilterableDocsFixtureMixin):
assert result == [
doc
for doc in filterable_docs
if (doc.meta.get("name") in ["name_0", "name_1"] or ("number" in doc.meta and doc.meta["number"] < 1))
if (
doc.meta.get("name") in ["name_0", "name_1"]
or (doc.meta.get("number") is not None and doc.meta["number"] < 1)
)
]
@pytest.mark.unit
@ -783,11 +791,8 @@ class LegacyFilterDocumentsNestedLogicalTest(FilterableDocsFixtureMixin):
doc
for doc in filterable_docs
if (
("number" in doc.meta and doc.meta["number"] < 1)
or (
doc.meta.get("name") in ["name_0", "name_1"]
and ("chapter" in doc.meta and doc.meta["chapter"] != "intro")
)
(doc.meta.get("number") is not None and doc.meta["number"] < 1)
or (doc.meta.get("name") in ["name_0", "name_1"] and (doc.meta.get("chapter") != "intro"))
)
]

View File

@ -1,297 +1,174 @@
from typing import List, Any, Union, Dict
from dataclasses import fields
from datetime import datetime
import numpy as np
import pandas as pd
from haystack.preview.dataclasses import Document
from haystack.preview.errors import FilterError
GT_TYPES = (int, float, np.number)
IN_TYPES = (list, set, tuple)
def not_operation(conditions: List[Any], document: Document, _current_key: str):
def document_matches_filter(filters: Dict[str, Any], document: Document) -> bool:
"""
Applies a NOT to all the nested conditions.
:param conditions: the filters dictionary.
:param document: the document to test.
:param _current_key: internal, don't use.
:return: True if the document matches the negated filters, False otherwise
Return whether `filters` match the Document.
For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation.
"""
return not and_operation(conditions=conditions, document=document, _current_key=_current_key)
if "field" in filters:
return _comparison_condition(filters, document)
return _logic_condition(filters, document)
def and_operation(conditions: List[Any], document: Document, _current_key: str):
"""
Applies an AND to all the nested conditions.
:param conditions: the filters dictionary.
:param document: the document to test.
:param _current_key: internal, don't use.
:return: True if the document matches all the filters, False otherwise
"""
return all(
document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
for condition in conditions
)
def _and(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """
    Return True only if every entry of `conditions` matches `document`.

    Each entry is evaluated with `_comparison_condition`; evaluation stops at the first
    entry that does not match. An empty list yields True.
    """
    for condition in conditions:
        if not _comparison_condition(condition, document):
            return False
    return True
def or_operation(conditions: List[Any], document: Document, _current_key: str):
"""
Applies an OR to all the nested conditions.
:param conditions: the filters dictionary.
:param document: the document to test.
:param _current_key: internal, don't use.
:return: True if the document matches any of the filters, False otherwise
"""
return any(
document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
for condition in conditions
)
def _or(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """
    Return True if at least one entry of `conditions` matches `document`.

    Each entry is evaluated with `_comparison_condition`; evaluation stops at the first
    entry that matches. An empty list yields False.
    """
    for condition in conditions:
        if _comparison_condition(condition, document):
            return True
    return False
def _safe_eq(first: Any, second: Any) -> bool:
"""
Compares objects for equality, even np.ndarrays and pandas DataFrames.
"""
if isinstance(first, pd.DataFrame):
first = first.to_json()
if isinstance(second, pd.DataFrame):
second = second.to_json()
if isinstance(first, np.ndarray):
first = first.tolist()
if isinstance(second, np.ndarray):
second = second.tolist()
return first == second
def _not(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """
    Negate the conjunction of `conditions`.

    The result is True unless all the entries of `conditions` match `document`.
    """
    all_match = _and(document, conditions)
    return not all_match
def _safe_gt(first: Any, second: Any) -> bool:
"""
Checks if first is bigger than second.
LOGICAL_OPERATORS = {"NOT": _not, "OR": _or, "AND": _and}
Works only for numerical values and dates in ISO format (YYYY-MM-DD). Strings, lists, tables and tensors all raise exceptions.
"""
if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES):
def _equal(document_value: Any, filter_value: Any) -> bool:
    """
    Compare a Document value with a filter value for equality.

    Any `pandas.DataFrame` operand is serialised with `to_json()` first, so DataFrames are
    compared through their JSON text rather than element-wise.
    """
    left, right = (
        value.to_json() if isinstance(value, pd.DataFrame) else value
        for value in (document_value, filter_value)
    )
    return left == right
def _not_equal(document_value: Any, filter_value: Any) -> bool:
    """Return the negation of `_equal` for the same pair of values."""
    is_equal = _equal(document_value=document_value, filter_value=filter_value)
    return not is_equal
def _greater_than(document_value: Any, filter_value: Any) -> bool:
if document_value is None or filter_value is None:
# We can't compare None values reliably using operators '>', '>=', '<', '<='
return False
if isinstance(document_value, str) or isinstance(filter_value, str):
try:
first = datetime.fromisoformat(first)
second = datetime.fromisoformat(second)
except (ValueError, TypeError):
raise FilterError(
f"Can't evaluate '{type(first).__name__} > {type(second).__name__}'. "
f"Convert these values into one of the following types: {[type_.__name__ for type_ in GT_TYPES]} "
f"or a datetime string in ISO 8601 format."
document_value = datetime.fromisoformat(document_value)
filter_value = datetime.fromisoformat(filter_value)
except (ValueError, TypeError) as exc:
msg = (
"Can't compare strings using operators '>', '>=', '<', '<='. "
"Strings are only comparable if they are ISO formatted dates."
)
return bool(first > second)
raise FilterError(msg) from exc
if type(filter_value) in [list, pd.DataFrame]:
msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
raise FilterError(msg)
return document_value > filter_value
def eq_operation(fields, field_name, value):
"""
Checks for equality between the document's field value value and a fixed value.
:param fields: all the document's field value
:param field_name: the field to test
:param value: the fixed value to compare against
:return: True if the values are equal, False otherwise
"""
if not field_name in fields:
def _greater_than_equal(document_value: Any, filter_value: Any) -> bool:
if document_value is None or filter_value is None:
# We can't compare None values reliably using operators '>', '>=', '<', '<='
return False
return _safe_eq(fields[field_name], value)
return _equal(document_value=document_value, filter_value=filter_value) or _greater_than(
document_value=document_value, filter_value=filter_value
)
def in_operation(fields, field_name, value):
"""
Checks for whether the document's field value value is present into the given list.
:param fields: all the document's field value
:param field_name: the field to test
:param value; the fixed value to compare against
:return: True if the document's value is included in the given list, False otherwise
"""
if not field_name in fields:
def _less_than(document_value: Any, filter_value: Any) -> bool:
if document_value is None or filter_value is None:
# We can't compare None values reliably using operators '>', '>=', '<', '<='
return False
if not isinstance(value, IN_TYPES):
raise FilterError("$in accepts only iterable values like lists, sets and tuples.")
return any(_safe_eq(fields[field_name], v) for v in value)
return not _greater_than_equal(document_value=document_value, filter_value=filter_value)
def ne_operation(fields, field_name, value):
"""
Checks for inequality between the document's field value value and a fixed value.
:param fields: all the document's field value
:param field_name: the field to test
:param value; the fixed value to compare against
:return: True if the values are different, False otherwise
"""
return not eq_operation(fields, field_name, value)
def nin_operation(fields, field_name, value):
"""
Checks whether the document's field value value is absent from the given list.
:param fields: all the document's field value
:param field_name: the field to test
:param value; the fixed value to compare against
:return: True if the document's value is not included in the given list, False otherwise
"""
return not in_operation(fields, field_name, value)
def gt_operation(fields, field_name, value):
"""
Checks whether the document's field value value is (strictly) larger than the given value.
:param fields: all the document's field value
:param field_name: the field to test
:param value; the fixed value to compare against
:return: True if the document's value is strictly larger than the fixed value, False otherwise
"""
if not field_name in fields:
def _less_than_equal(document_value: Any, filter_value: Any) -> bool:
if document_value is None or filter_value is None:
# We can't compare None values reliably using operators '>', '>=', '<', '<='
return False
return _safe_gt(fields[field_name], value)
return not _greater_than(document_value=document_value, filter_value=filter_value)
def gte_operation(fields, field_name, value):
"""
Checks whether the document's field value value is larger than or equal to the given value.
:param fields: all the document's field value
:param field_name: the field to test
:param value; the fixed value to compare against
:return: True if the document's value is larger than or equal to the fixed value, False otherwise
"""
return gt_operation(fields, field_name, value) or eq_operation(fields, field_name, value)
def _in(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether `document_value` equals at least one element of `filter_value`.

    :param document_value: The value read from the Document.
    :param filter_value: The list of accepted values.
    :return: True if any element of `filter_value` is equal (as defined by `_equal`) to `document_value`.
    :raises FilterError: If `filter_value` is not a `list`.
    """
    if isinstance(filter_value, list):
        for candidate in filter_value:
            # Same argument order as the original call: list element first, document value second.
            if _equal(candidate, document_value):
                return True
        return False
    msg = (
        f"Filter value must be a `list` when using operator 'in' or 'not in', received type '{type(filter_value)}'"
    )
    raise FilterError(msg)
def lt_operation(fields, field_name, value):
"""
Checks whether the document's field value value is (strictly) smaller than the given value.
:param fields: all the document's field value
:param field_name: the field to test
:param value; the fixed value to compare against
:return: True if the document's value is strictly smaller than the fixed value, False otherwise
"""
if not field_name in fields:
return False
return not _safe_gt(fields[field_name], value) and not _safe_eq(fields[field_name], value)
def _not_in(document_value: Any, filter_value: Any) -> bool:
    """Return the negation of `_in` for the same pair of values."""
    is_member = _in(document_value=document_value, filter_value=filter_value)
    return not is_member
def lte_operation(fields, field_name, value):
"""
Checks whether the document's field value value is smaller than or equal to the given value.
:param fields: all the document's field value
:param field_name: the field to test
:param value; the fixed value to compare against
:return: True if the document's value is smaller than or equal to the fixed value, False otherwise
"""
if not field_name in fields:
return False
return not _safe_gt(fields[field_name], value)
LOGICAL_STATEMENTS = {"$not": not_operation, "$and": and_operation, "$or": or_operation}
OPERATORS = {
"$eq": eq_operation,
"$in": in_operation,
"$ne": ne_operation,
"$nin": nin_operation,
"$gt": gt_operation,
"$gte": gte_operation,
"$lt": lt_operation,
"$lte": lte_operation,
COMPARISON_OPERATORS = {
"==": _equal,
"!=": _not_equal,
">": _greater_than,
">=": _greater_than_equal,
"<": _less_than,
"<=": _less_than_equal,
"in": _in,
"not in": _not_in,
}
RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()]
def document_matches_filter(conditions: Union[Dict, List], document: Document, _current_key=None):
"""
Check if a document's metadata matches the provided filter conditions.
This function evaluates the specified conditions against the metadata of the given document
and returns True if the conditions are met, otherwise it returns False.
:param conditions: A dictionary or list containing filter conditions to be applied to the document's metadata.
:param document: The document whose metadata will be evaluated against the conditions.
:param _current_key: internal parameter, don't use.
:return: True if the document's metadata matches the filter conditions, False otherwise.
"""
if isinstance(conditions, dict):
# Check for malformed filters, like {"name": {"year": "2020"}}
if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()):
raise FilterError(
f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. "
"Comparisons between dictionaries are not currently supported. "
"Check the documentation to learn more about filters syntax."
)
if len(conditions.keys()) > 1:
# The default operation for a list of sibling conditions is $and
return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key)
field_key, field_value = list(conditions.items())[0]
# Nested logical statement ($and, $or, $not)
if field_key in LOGICAL_STATEMENTS.keys():
return LOGICAL_STATEMENTS[field_key](
conditions=_list_conditions(field_value), document=document, _current_key=_current_key
)
# A comparison operator ($eq, $in, $gte, ...)
if field_key in OPERATORS.keys():
if not _current_key:
raise FilterError(
"Filters can't start with an operator like $eq and $in. You have to specify the field name first. "
"See the examples in the documentation."
)
return OPERATORS[field_key](fields=document.to_dict(), field_name=_current_key, value=field_value)
# Otherwise fall back to the defaults
conditions = _list_conditions(field_value)
_current_key = field_key
# Defaults for implicit filters
if isinstance(conditions, list):
if all(isinstance(cond, dict) for cond in conditions):
# The default operation for a list of sibling conditions is $and
return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key)
else:
# The default operator for a {key: [value1, value2]} filter is $in
return in_operation(fields=document.to_dict(), field_name=_current_key, value=conditions)
if _current_key:
# The default operator for a {key: value} filter is $eq
return eq_operation(fields=document.to_dict(), field_name=_current_key, value=conditions)
raise FilterError("Filters must be dictionaries or lists. See the examples in the documentation.")
def _logic_condition(condition: Dict[str, Any], document: Document) -> bool:
    """
    Evaluate a Logic dictionary against `document`.

    :param condition: A dictionary holding the keys `operator` (one of the keys of
        `LOGICAL_OPERATORS`) and `conditions` (the nested filter dictionaries).
    :param document: The Document to test.
    :return: The result of applying the logical operator to the nested conditions.
    :raises FilterError: If the `operator` or `conditions` key is missing, or if `operator`
        is not a known logical operator.
    """
    if "operator" not in condition:
        msg = f"'operator' key missing in {condition}"
        raise FilterError(msg)
    if "conditions" not in condition:
        msg = f"'conditions' key missing in {condition}"
        raise FilterError(msg)
    operator: str = condition["operator"]
    conditions: List[Dict[str, Any]] = condition["conditions"]
    if operator not in LOGICAL_OPERATORS:
        # Report an unknown operator like every other malformed filter,
        # instead of leaking a bare KeyError from the dispatch table lookup.
        msg = f"Unknown logical operator '{operator}'. Valid operators are: {list(LOGICAL_OPERATORS)}"
        raise FilterError(msg)
    return LOGICAL_OPERATORS[operator](document, conditions)
def _list_conditions(conditions: Any) -> List[Any]:
"""
Make sure all nested conditions are not dictionaries or single values, but always lists.
def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool:
if "field" not in condition:
# 'field' key is only found in comparison dictionaries.
# We assume this is a logic dictionary since it's not present.
return _logic_condition(condition, document)
field: str = condition["field"]
:param conditions: the conditions to transform into a list
:returns: a list of filters
"""
if isinstance(conditions, list):
return conditions
if isinstance(conditions, dict):
return [{key: value} for key, value in conditions.items()]
return [conditions]
if "operator" not in condition:
msg = f"'operator' key missing in {condition}"
raise FilterError(msg)
if "value" not in condition:
msg = f"'value' key missing in {condition}"
raise FilterError(msg)
if "." in field:
# Handles fields formatted like so:
# 'meta.person.name'
parts = field.split(".")
document_value = getattr(document, parts[0])
for part in parts[1:]:
if part not in document_value:
# If a field is not found we treat it as None
document_value = None
break
document_value = document_value[part]
elif field not in [f.name for f in fields(document)]:
# Converted legacy filters don't add the `meta.` prefix, so we assume
# that all filter fields that are not actual fields in Document are converted
# filters.
#
# We handle this to avoid breaking compatibility with converted legacy filters.
# This will be removed as soon as we stop supporting legacy filters.
document_value = document.meta.get(field)
else:
document_value = getattr(document, field)
operator: str = condition["operator"]
filter_value: Any = condition["value"]
return COMPARISON_OPERATORS[operator](filter_value=filter_value, document_value=document_value)
def convert(filters: Dict[str, Any]) -> Dict[str, Any]:

View File

@ -0,0 +1,87 @@
---
prelude: >
With proposal [#6001](https://github.com/deepset-ai/haystack/pull/6001) we introduced a better specification to declare filters in Haystack 2.x.
The new syntax is a bit more verbose but less confusing and ambiguous as there are no implicit operators.
This will simplify conversion from this common syntax to Document Store specific filtering logic, so it will ease
the development of new Document Stores.
Since everything must be declared explicitly, it will also make it easier for users to understand the filters just
by reading them.
The full specification is as follows.
---
Filters top level must be a dictionary.
There are two types of dictionaries:
- Comparison
- Logic
The top level can be either a Comparison or a Logic dictionary.
Comparison dictionaries must contain the keys:
- `field`
- `operator`
- `value`
Logic dictionaries must contain the keys:
- `operator`
- `conditions`
`conditions` key must be a list of dictionaries, either Comparison or Logic.
`operator` values in Comparison dictionaries must be:
- `==`
- `!=`
- `>`
- `>=`
- `<`
- `<=`
- `in`
- `not in`
`operator` values in Logic dictionaries must be:
- `NOT`
- `OR`
- `AND`
---
A simple filter:
```python
filters = {"field": "meta.type", "operator": "==", "value": "article"}
```
A more complex filter:
```python
filters = {
"operator": "AND",
"conditions": [
{"field": "meta.type", "operator": "==", "value": "article"},
{"field": "meta.date", "operator": ">=", "value": 1420066800},
{"field": "meta.date", "operator": "<", "value": 1609455600},
{"field": "meta.rating", "operator": ">=", "value": 3},
{
"operator": "OR",
"conditions": [
{"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
{"field": "meta.publisher", "operator": "==", "value": "nytimes"},
],
},
],
}
```
---
To avoid causing too much disruption for users using legacy filters we'll keep supporting them for the time being.
We also provide a utility `convert` function for developers implementing their Document Store to do the same.
preview:
- |
Refactored `InMemoryDocumentStore` and `MetadataRouter` filtering logic to support new filters declaration.

View File

@ -8,8 +8,20 @@ class TestMetadataRouter:
@pytest.mark.unit
def test_run(self):
rules = {
"edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}},
"edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}},
"edge_1": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
],
},
"edge_2": {
"operator": "AND",
"conditions": [
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
],
},
}
router = MetadataRouter(rules=rules)
documents = [

View File

@ -146,10 +146,6 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904
results = document_store.bm25_retrieval(query="Python", top_k=1)
assert results[0].content == "Python is a popular programming language"
@pytest.mark.skip(reason="Filter is not working properly, see https://github.com/deepset-ai/haystack/issues/6153")
def test_eq_filter_embedding(self, document_store: InMemoryDocumentStore, filterable_docs):
pass
# Test a query, add a new document and make sure results are appropriately updated
@pytest.mark.unit
def test_bm25_retrieval_with_updated_docs(self, document_store: InMemoryDocumentStore):

File diff suppressed because it is too large Load Diff