mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-29 16:59:47 +00:00
refactor: Add support for new filters declaration (#6397)
* Rework filter logic for InMemoryDocumentStore to support new filters declaration * Fix legacy filters tests * Simplify logic and handle dates comparison * Rework MetadataRouter to support new filters * Update docstrings * Add release notes * Fix linting * Avoid duplicating filters specifications * Handle corner case * Simplify docstring * Fix filters logic and tests * Fix Document Store testing legacy filters tests
This commit is contained in:
parent
28c2b09d90
commit
fd16ec63cb
@ -18,7 +18,7 @@ def test_preprocessing_pipeline(tmp_path):
|
||||
preprocessing_pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
|
||||
preprocessing_pipeline.add_component(instance=DocumentLanguageClassifier(), name="language_classifier")
|
||||
preprocessing_pipeline.add_component(
|
||||
instance=MetadataRouter(rules={"en": {"language": {"$eq": "en"}}}), name="router"
|
||||
instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router"
|
||||
)
|
||||
preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
|
||||
preprocessing_pipeline.add_component(
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from typing import Dict, List
|
||||
|
||||
from haystack.preview import component, Document
|
||||
from haystack.preview.utils.filters import document_matches_filter
|
||||
from haystack.preview.utils.filters import document_matches_filter, convert
|
||||
|
||||
|
||||
@component
|
||||
@ -19,12 +19,36 @@ class MetadataRouter:
|
||||
follow the format of filtering expressions in Haystack. For example:
|
||||
```python
|
||||
{
|
||||
"edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}},
|
||||
"edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}},
|
||||
"edge_3": {"created_at": {"$gte": "2023-07-01", "$lt": "2023-10-01"}},
|
||||
"edge_4": {"created_at": {"$gte": "2023-10-01", "$lt": "2024-01-01"}},
|
||||
}
|
||||
```
|
||||
"edge_1": {
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
|
||||
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
|
||||
],
|
||||
},
|
||||
"edge_2": {
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
|
||||
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
|
||||
],
|
||||
},
|
||||
"edge_3": {
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.created_at", "operator": ">=", "value": "2023-07-01"},
|
||||
{"field": "meta.created_at", "operator": "<", "value": "2023-10-01"},
|
||||
],
|
||||
},
|
||||
"edge_4": {
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.created_at", "operator": ">=", "value": "2023-10-01"},
|
||||
{"field": "meta.created_at", "operator": "<", "value": "2024-01-01"},
|
||||
],
|
||||
},
|
||||
}
|
||||
```
|
||||
"""
|
||||
self.rules = rules
|
||||
component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
|
||||
@ -43,6 +67,9 @@ class MetadataRouter:
|
||||
for document in documents:
|
||||
cur_document_matched = False
|
||||
for edge, rule in self.rules.items():
|
||||
if "operator" not in rule:
|
||||
# Must be a legacy filter, convert it
|
||||
rule = convert(rule)
|
||||
if document_matches_filter(rule, document):
|
||||
output[edge].append(document)
|
||||
cur_document_matched = True
|
||||
|
||||
@ -11,7 +11,7 @@ from haystack.preview import default_from_dict, default_to_dict
|
||||
from haystack.preview.document_stores.decorator import document_store
|
||||
from haystack.preview.dataclasses import Document
|
||||
from haystack.preview.document_stores.protocols import DuplicatePolicy
|
||||
from haystack.preview.utils.filters import document_matches_filter
|
||||
from haystack.preview.utils.filters import document_matches_filter, convert
|
||||
from haystack.preview.document_stores.errors import DuplicateDocumentError, DocumentStoreError
|
||||
from haystack.preview.utils import expit
|
||||
|
||||
@ -92,75 +92,15 @@ class InMemoryDocumentStore:
|
||||
"""
|
||||
Returns the documents that match the filters provided.
|
||||
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
|
||||
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
|
||||
`"$lte"`) or a metadata field name.
|
||||
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
|
||||
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
|
||||
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
|
||||
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
|
||||
as default operation.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
# or simpler using default operators
|
||||
filters = {
|
||||
"type": "article",
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": ["economy", "politics"],
|
||||
"publisher": "nytimes"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators can take a list of
|
||||
dictionaries as value.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
{
|
||||
"$and": {
|
||||
"Type": "News Paper",
|
||||
"Date": {
|
||||
"$lt": "2019-01-01"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"$and": {
|
||||
"Type": "Blog Post",
|
||||
"Date": {
|
||||
"$gte": "2019-01-01"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation.
|
||||
|
||||
:param filters: The filters to apply to the document list.
|
||||
:return: A list of Documents that match the given filters.
|
||||
"""
|
||||
if filters:
|
||||
return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)]
|
||||
if "operator" not in filters:
|
||||
filters = convert(filters)
|
||||
return [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
|
||||
return list(self.storage.values())
|
||||
|
||||
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> int:
|
||||
@ -220,9 +160,17 @@ class InMemoryDocumentStore:
|
||||
if not query:
|
||||
raise ValueError("Query should be a non-empty string")
|
||||
|
||||
content_type_filter = {"$or": {"content": {"$not": None}, "dataframe": {"$not": None}}}
|
||||
content_type_filter = {
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{"field": "content", "operator": "!=", "value": None},
|
||||
{"field": "dataframe", "operator": "!=", "value": None},
|
||||
],
|
||||
}
|
||||
if filters:
|
||||
filters = {"$and": [content_type_filter, filters]}
|
||||
if "operator" not in filters:
|
||||
filters = convert(filters)
|
||||
filters = {"operator": "AND", "conditions": [content_type_filter, filters]}
|
||||
else:
|
||||
filters = content_type_filter
|
||||
all_documents = self.filter_documents(filters=filters)
|
||||
|
||||
@ -51,69 +51,64 @@ class DocumentStore(Protocol):
|
||||
"""
|
||||
Returns the documents that match the filters provided.
|
||||
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`,
|
||||
`"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`,
|
||||
`"$lte"`) or a metadata field name.
|
||||
Filters are defined as nested dictionaries that can be of two types:
|
||||
- Comparison
|
||||
- Logic
|
||||
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata
|
||||
field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or
|
||||
(in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default
|
||||
operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used
|
||||
as default operation.
|
||||
Comparison dictionaries must contain the keys:
|
||||
|
||||
Example:
|
||||
- `field`
|
||||
- `operator`
|
||||
- `value`
|
||||
|
||||
Logic dictionaries must contain the keys:
|
||||
|
||||
- `operator`
|
||||
- `conditions`
|
||||
|
||||
The `conditions` key must be a list of dictionaries, either of type Comparison or Logic.
|
||||
|
||||
The `operator` value in Comparison dictionaries must be one of:
|
||||
|
||||
- `==`
|
||||
- `!=`
|
||||
- `>`
|
||||
- `>=`
|
||||
- `<`
|
||||
- `<=`
|
||||
- `in`
|
||||
- `not in`
|
||||
|
||||
The `operator` values in Logic dictionaries must be one of:
|
||||
|
||||
- `NOT`
|
||||
- `OR`
|
||||
- `AND`
|
||||
|
||||
|
||||
A simple filter:
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
# or simpler using default operators
|
||||
filters = {
|
||||
"type": "article",
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": ["economy", "politics"],
|
||||
"publisher": "nytimes"
|
||||
}
|
||||
}
|
||||
filters = {"field": "meta.type", "operator": "==", "value": "article"}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators can take a list of
|
||||
dictionaries as value.
|
||||
|
||||
Example:
|
||||
|
||||
A more complex filter:
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.type", "operator": "==", "value": "article"},
|
||||
{"field": "meta.date", "operator": ">=", "value": 1420066800},
|
||||
{"field": "meta.date", "operator": "<", "value": 1609455600},
|
||||
{"field": "meta.rating", "operator": ">=", "value": 3},
|
||||
{
|
||||
"$and": {
|
||||
"Type": "News Paper",
|
||||
"Date": {
|
||||
"$lt": "2019-01-01"
|
||||
}
|
||||
}
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
|
||||
{"field": "meta.publisher", "operator": "==", "value": "nytimes"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"$and": {
|
||||
"Type": "Blog Post",
|
||||
"Date": {
|
||||
"$gte": "2019-01-01"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
],
|
||||
}
|
||||
```
|
||||
|
||||
:param filters: the filters to apply to the document list.
|
||||
:return: a list of Documents that match the given filters.
|
||||
|
||||
@ -236,7 +236,7 @@ class LegacyFilterDocumentsInvalidFiltersTest(FilterableDocsFixtureMixin):
|
||||
@pytest.mark.unit
|
||||
def test_incorrect_filter_type(self, document_store: DocumentStore, filterable_docs: List[Document]):
|
||||
document_store.write_documents(filterable_docs)
|
||||
with pytest.raises(FilterError):
|
||||
with pytest.raises(ValueError):
|
||||
document_store.filter_documents(filters="something odd") # type: ignore
|
||||
|
||||
@pytest.mark.unit
|
||||
@ -574,7 +574,9 @@ class LegacyFilterDocumentsLessThanTest(FilterableDocsFixtureMixin):
|
||||
def test_lt_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
|
||||
document_store.write_documents(filterable_docs)
|
||||
result = document_store.filter_documents(filters={"number": {"$lt": 0.0}})
|
||||
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] < 0]
|
||||
assert result == [
|
||||
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] < 0
|
||||
]
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_lt_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
|
||||
@ -614,7 +616,9 @@ class LegacyFilterDocumentsLessThanEqualTest(FilterableDocsFixtureMixin):
|
||||
def test_lte_filter(self, document_store: DocumentStore, filterable_docs: List[Document]):
|
||||
document_store.write_documents(filterable_docs)
|
||||
result = document_store.filter_documents(filters={"number": {"$lte": 2.0}})
|
||||
assert result == [doc for doc in filterable_docs if "number" in doc.meta and doc.meta["number"] <= 2.0]
|
||||
assert result == [
|
||||
doc for doc in filterable_docs if doc.meta.get("number") is not None and doc.meta["number"] <= 2.0
|
||||
]
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_lte_filter_non_numeric(self, document_store: DocumentStore, filterable_docs: List[Document]):
|
||||
@ -658,7 +662,8 @@ class LegacyFilterDocumentsSimpleLogicalTest(FilterableDocsFixtureMixin):
|
||||
assert result == [
|
||||
doc
|
||||
for doc in filterable_docs
|
||||
if (("number" in doc.meta and doc.meta["number"] < 1) or doc.meta.get("name") in ["name_0", "name_1"])
|
||||
if (doc.meta.get("number") is not None and doc.meta["number"] < 1)
|
||||
or doc.meta.get("name") in ["name_0", "name_1"]
|
||||
]
|
||||
|
||||
@pytest.mark.unit
|
||||
@ -733,7 +738,10 @@ class LegacyFilterDocumentsNestedLogicalTest(FilterableDocsFixtureMixin):
|
||||
assert result == [
|
||||
doc
|
||||
for doc in filterable_docs
|
||||
if (doc.meta.get("name") in ["name_0", "name_1"] or ("number" in doc.meta and doc.meta["number"] < 1))
|
||||
if (
|
||||
doc.meta.get("name") in ["name_0", "name_1"]
|
||||
or (doc.meta.get("number") is not None and doc.meta["number"] < 1)
|
||||
)
|
||||
]
|
||||
|
||||
@pytest.mark.unit
|
||||
@ -783,11 +791,8 @@ class LegacyFilterDocumentsNestedLogicalTest(FilterableDocsFixtureMixin):
|
||||
doc
|
||||
for doc in filterable_docs
|
||||
if (
|
||||
("number" in doc.meta and doc.meta["number"] < 1)
|
||||
or (
|
||||
doc.meta.get("name") in ["name_0", "name_1"]
|
||||
and ("chapter" in doc.meta and doc.meta["chapter"] != "intro")
|
||||
)
|
||||
(doc.meta.get("number") is not None and doc.meta["number"] < 1)
|
||||
or (doc.meta.get("name") in ["name_0", "name_1"] and (doc.meta.get("chapter") != "intro"))
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@ -1,297 +1,174 @@
|
||||
from typing import List, Any, Union, Dict
|
||||
from dataclasses import fields
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from haystack.preview.dataclasses import Document
|
||||
from haystack.preview.errors import FilterError
|
||||
|
||||
|
||||
GT_TYPES = (int, float, np.number)
|
||||
IN_TYPES = (list, set, tuple)
|
||||
|
||||
|
||||
def not_operation(conditions: List[Any], document: Document, _current_key: str):
|
||||
def document_matches_filter(filters: Dict[str, Any], document: Document) -> bool:
    """
    Check whether a Document satisfies `filters`.

    A dictionary that holds a `field` key is evaluated as a comparison, any other
    dictionary is evaluated as a logic condition.
    For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol documentation.

    :param filters: the filters dictionary.
    :param document: the document to test.
    :return: True if the document matches the filters, False otherwise.
    """
    # Only comparison dictionaries carry a 'field' key, so its presence picks the evaluator
    evaluate = _comparison_condition if "field" in filters else _logic_condition
    return evaluate(filters, document)
|
||||
|
||||
|
||||
def and_operation(conditions: List[Any], document: Document, _current_key: str):
|
||||
"""
|
||||
Applies an AND to all the nested conditions.
|
||||
|
||||
:param conditions: the filters dictionary.
|
||||
:param document: the document to test.
|
||||
:param _current_key: internal, don't use.
|
||||
:return: True if the document matches all the filters, False otherwise
|
||||
"""
|
||||
return all(
|
||||
document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
|
||||
for condition in conditions
|
||||
)
|
||||
def _and(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """
    Return True only if the Document satisfies every entry of `conditions`.

    Each entry is handed to `_comparison_condition`, which forwards dictionaries
    without a `field` key to `_logic_condition`, so nested logic is supported.
    Evaluation stops at the first entry that does not match.
    """
    for entry in conditions:
        if not _comparison_condition(entry, document):
            return False
    return True
|
||||
|
||||
|
||||
def or_operation(conditions: List[Any], document: Document, _current_key: str):
|
||||
"""
|
||||
Applies an OR to all the nested conditions.
|
||||
|
||||
:param conditions: the filters dictionary.
|
||||
:param document: the document to test.
|
||||
:param _current_key: internal, don't use.
|
||||
:return: True if the document matches any of the filters, False otherwise
|
||||
"""
|
||||
return any(
|
||||
document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
|
||||
for condition in conditions
|
||||
)
|
||||
def _or(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """
    Return True if the Document satisfies at least one entry of `conditions`.

    Each entry is handed to `_comparison_condition`, which forwards dictionaries
    without a `field` key to `_logic_condition`, so nested logic is supported.
    Evaluation stops at the first entry that matches.
    """
    for entry in conditions:
        if _comparison_condition(entry, document):
            return True
    return False
|
||||
|
||||
|
||||
def _safe_eq(first: Any, second: Any) -> bool:
|
||||
"""
|
||||
Compares objects for equality, even np.ndarrays and pandas DataFrames.
|
||||
"""
|
||||
|
||||
if isinstance(first, pd.DataFrame):
|
||||
first = first.to_json()
|
||||
|
||||
if isinstance(second, pd.DataFrame):
|
||||
second = second.to_json()
|
||||
|
||||
if isinstance(first, np.ndarray):
|
||||
first = first.tolist()
|
||||
|
||||
if isinstance(second, np.ndarray):
|
||||
second = second.tolist()
|
||||
|
||||
return first == second
|
||||
def _not(document: Document, conditions: List[Dict[str, Any]]) -> bool:
    """
    Return the negation of `_and` applied to `conditions`.

    The result is True as soon as at least one of the conditions is not satisfied.
    """
    every_condition_matched = _and(document, conditions)
    return not every_condition_matched
|
||||
|
||||
|
||||
def _safe_gt(first: Any, second: Any) -> bool:
|
||||
"""
|
||||
Checks if first is bigger than second.
|
||||
# Dispatch table: logic operator name -> function evaluating it as `func(document, conditions)`.
LOGICAL_OPERATORS = {"NOT": _not, "OR": _or, "AND": _and}
|
||||
|
||||
Works only for numerical values and dates in ISO format (YYYY-MM-DD). Strings, lists, tables and tensors all raise exceptions.
|
||||
"""
|
||||
if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES):
|
||||
|
||||
def _equal(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value equals the filter's value.

    A pandas DataFrame on either side is serialised to JSON first, so DataFrames
    are compared through their JSON text rather than element-wise.
    """
    left, right = (
        value.to_json() if isinstance(value, pd.DataFrame) else value
        for value in (document_value, filter_value)
    )
    return left == right
|
||||
|
||||
|
||||
def _not_equal(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value differs from the filter's value.

    This is the exact negation of `_equal`.
    """
    values_are_equal = _equal(document_value=document_value, filter_value=filter_value)
    return not values_are_equal
|
||||
|
||||
|
||||
def _greater_than(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value is strictly greater than the filter's value.

    A None value on either side never matches. If either side is a string, both
    sides must be ISO formatted date strings and are compared as datetimes.

    :param document_value: the value read from the Document.
    :param filter_value: the value declared in the filter.
    :return: the result of `document_value > filter_value`, False if either value is None.
    :raises FilterError: if `filter_value` is a list or a pandas DataFrame (or a subclass of
        those), or if a string is involved and the values can't both be parsed as ISO dates.
    """
    if document_value is None or filter_value is None:
        # We can't compare None values reliably using operators '>', '>=', '<', '<='
        return False

    # Reject container filter values first. isinstance also covers subclasses, and checking
    # before the date parsing keeps the error accurate when the Document value is a string.
    if isinstance(filter_value, (list, pd.DataFrame)):
        msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
        raise FilterError(msg)

    if isinstance(document_value, str) or isinstance(filter_value, str):
        try:
            # fromisoformat raises TypeError for a non-string, so a string compared
            # against a non-string is reported as a malformed filter as well
            document_value = datetime.fromisoformat(document_value)
            filter_value = datetime.fromisoformat(filter_value)
        except (ValueError, TypeError) as exc:
            msg = (
                "Can't compare strings using operators '>', '>=', '<', '<='. "
                "Strings are only comparable if they are ISO formatted dates."
            )
            raise FilterError(msg) from exc

    return document_value > filter_value
|
||||
|
||||
|
||||
def eq_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks for equality between the document's field value value and a fixed value.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value: the fixed value to compare against
|
||||
:return: True if the values are equal, False otherwise
|
||||
"""
|
||||
if not field_name in fields:
|
||||
def _greater_than_equal(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value is greater than or equal to the filter's value.

    Equality is tested first through `_equal`; only when that fails is the strict
    comparison of `_greater_than` evaluated. A None value on either side never matches.
    """
    if document_value is not None and filter_value is not None:
        same = _equal(document_value=document_value, filter_value=filter_value)
        return same if same else _greater_than(document_value=document_value, filter_value=filter_value)

    # We can't compare None values reliably using operators '>', '>=', '<', '<='
    return False
|
||||
|
||||
|
||||
def in_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks for whether the document's field value value is present into the given list.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value; the fixed value to compare against
|
||||
:return: True if the document's value is included in the given list, False otherwise
|
||||
"""
|
||||
if not field_name in fields:
|
||||
def _less_than(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value is strictly smaller than the filter's value.

    Computed as the negation of `_greater_than_equal`. A None value on either side
    never matches.
    """
    if document_value is not None and filter_value is not None:
        at_least = _greater_than_equal(document_value=document_value, filter_value=filter_value)
        return not at_least

    # We can't compare None values reliably using operators '>', '>=', '<', '<='
    return False
|
||||
|
||||
|
||||
def ne_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks for inequality between the document's field value value and a fixed value.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value; the fixed value to compare against
|
||||
:return: True if the values are different, False otherwise
|
||||
"""
|
||||
return not eq_operation(fields, field_name, value)
|
||||
|
||||
|
||||
def nin_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks whether the document's field value value is absent from the given list.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value; the fixed value to compare against
|
||||
:return: True if the document's value is not included in the given list, False otherwise
|
||||
"""
|
||||
return not in_operation(fields, field_name, value)
|
||||
|
||||
|
||||
def gt_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks whether the document's field value value is (strictly) larger than the given value.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value; the fixed value to compare against
|
||||
:return: True if the document's value is strictly larger than the fixed value, False otherwise
|
||||
"""
|
||||
if not field_name in fields:
|
||||
def _less_than_equal(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value is smaller than or equal to the filter's value.

    Computed as the negation of `_greater_than`. A None value on either side never matches.
    """
    if document_value is not None and filter_value is not None:
        strictly_greater = _greater_than(document_value=document_value, filter_value=filter_value)
        return not strictly_greater

    # We can't compare None values reliably using operators '>', '>=', '<', '<='
    return False
|
||||
|
||||
|
||||
def gte_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks whether the document's field value value is larger than or equal to the given value.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value; the fixed value to compare against
|
||||
:return: True if the document's value is larger than or equal to the fixed value, False otherwise
|
||||
"""
|
||||
return gt_operation(fields, field_name, value) or eq_operation(fields, field_name, value)
|
||||
def _in(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value equals any element of the filter's list.

    Elements are compared with `_equal`. Raises FilterError when `filter_value`
    is not a `list`.
    """
    if isinstance(filter_value, list):
        for candidate in filter_value:
            if _equal(candidate, document_value):
                return True
        return False

    msg = (
        f"Filter value must be a `list` when using operator 'in' or 'not in', received type '{type(filter_value)}'"
    )
    raise FilterError(msg)
|
||||
|
||||
|
||||
def lt_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks whether the document's field value value is (strictly) smaller than the given value.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value; the fixed value to compare against
|
||||
:return: True if the document's value is strictly smaller than the fixed value, False otherwise
|
||||
"""
|
||||
if not field_name in fields:
|
||||
return False
|
||||
return not _safe_gt(fields[field_name], value) and not _safe_eq(fields[field_name], value)
|
||||
def _not_in(document_value: Any, filter_value: Any) -> bool:
    """
    Tell whether the Document's value equals none of the elements of the filter's list.

    This is the exact negation of `_in`, including its error for non-list filter values.
    """
    found = _in(document_value=document_value, filter_value=filter_value)
    return not found
|
||||
|
||||
|
||||
def lte_operation(fields, field_name, value):
|
||||
"""
|
||||
Checks whether the document's field value value is smaller than or equal to the given value.
|
||||
|
||||
:param fields: all the document's field value
|
||||
:param field_name: the field to test
|
||||
:param value; the fixed value to compare against
|
||||
:return: True if the document's value is smaller than or equal to the fixed value, False otherwise
|
||||
"""
|
||||
if not field_name in fields:
|
||||
return False
|
||||
return not _safe_gt(fields[field_name], value)
|
||||
|
||||
|
||||
LOGICAL_STATEMENTS = {"$not": not_operation, "$and": and_operation, "$or": or_operation}
|
||||
OPERATORS = {
|
||||
"$eq": eq_operation,
|
||||
"$in": in_operation,
|
||||
"$ne": ne_operation,
|
||||
"$nin": nin_operation,
|
||||
"$gt": gt_operation,
|
||||
"$gte": gte_operation,
|
||||
"$lt": lt_operation,
|
||||
"$lte": lte_operation,
|
||||
# Dispatch table: comparison operator name -> function evaluating it.
# Every function is called with the keyword arguments `document_value` and `filter_value`.
COMPARISON_OPERATORS = {
    "==": _equal,
    "!=": _not_equal,
    ">": _greater_than,
    ">=": _greater_than_equal,
    "<": _less_than,
    "<=": _less_than_equal,
    "in": _in,
    "not in": _not_in,
}
|
||||
RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()]
|
||||
|
||||
|
||||
def document_matches_filter(conditions: Union[Dict, List], document: Document, _current_key=None):
|
||||
"""
|
||||
Check if a document's metadata matches the provided filter conditions.
|
||||
|
||||
This function evaluates the specified conditions against the metadata of the given document
|
||||
and returns True if the conditions are met, otherwise it returns False.
|
||||
|
||||
:param conditions: A dictionary or list containing filter conditions to be applied to the document's metadata.
|
||||
:param document: The document whose metadata will be evaluated against the conditions.
|
||||
:param _current_key: internal parameter, don't use.
|
||||
:return: True if the document's metadata matches the filter conditions, False otherwise.
|
||||
"""
|
||||
if isinstance(conditions, dict):
|
||||
# Check for malformed filters, like {"name": {"year": "2020"}}
|
||||
if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()):
|
||||
raise FilterError(
|
||||
f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. "
|
||||
"Comparisons between dictionaries are not currently supported. "
|
||||
"Check the documentation to learn more about filters syntax."
|
||||
)
|
||||
|
||||
if len(conditions.keys()) > 1:
|
||||
# The default operation for a list of sibling conditions is $and
|
||||
return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key)
|
||||
|
||||
field_key, field_value = list(conditions.items())[0]
|
||||
|
||||
# Nested logical statement ($and, $or, $not)
|
||||
if field_key in LOGICAL_STATEMENTS.keys():
|
||||
return LOGICAL_STATEMENTS[field_key](
|
||||
conditions=_list_conditions(field_value), document=document, _current_key=_current_key
|
||||
)
|
||||
|
||||
# A comparison operator ($eq, $in, $gte, ...)
|
||||
if field_key in OPERATORS.keys():
|
||||
if not _current_key:
|
||||
raise FilterError(
|
||||
"Filters can't start with an operator like $eq and $in. You have to specify the field name first. "
|
||||
"See the examples in the documentation."
|
||||
)
|
||||
return OPERATORS[field_key](fields=document.to_dict(), field_name=_current_key, value=field_value)
|
||||
|
||||
# Otherwise fall back to the defaults
|
||||
conditions = _list_conditions(field_value)
|
||||
_current_key = field_key
|
||||
|
||||
# Defaults for implicit filters
|
||||
if isinstance(conditions, list):
|
||||
if all(isinstance(cond, dict) for cond in conditions):
|
||||
# The default operation for a list of sibling conditions is $and
|
||||
return and_operation(conditions=_list_conditions(conditions), document=document, _current_key=_current_key)
|
||||
else:
|
||||
# The default operator for a {key: [value1, value2]} filter is $in
|
||||
return in_operation(fields=document.to_dict(), field_name=_current_key, value=conditions)
|
||||
|
||||
if _current_key:
|
||||
# The default operator for a {key: value} filter is $eq
|
||||
return eq_operation(fields=document.to_dict(), field_name=_current_key, value=conditions)
|
||||
|
||||
raise FilterError("Filters must be dictionaries or lists. See the examples in the documentation.")
|
||||
def _logic_condition(condition: Dict[str, Any], document: Document) -> bool:
    """
    Evaluate a Logic dictionary against the Document.

    :param condition: dictionary holding an `operator` key (one of the keys of
        `LOGICAL_OPERATORS`) and a `conditions` key with the nested filter dictionaries.
    :param document: the document to test.
    :return: the result of applying the logical operator to the nested conditions.
    :raises FilterError: if the `operator` or `conditions` key is missing, or if the
        operator is not a known logical operator.
    """
    if "operator" not in condition:
        msg = f"'operator' key missing in {condition}"
        raise FilterError(msg)
    if "conditions" not in condition:
        msg = f"'conditions' key missing in {condition}"
        raise FilterError(msg)
    operator: str = condition["operator"]
    conditions: List[Dict[str, Any]] = condition["conditions"]

    try:
        evaluate = LOGICAL_OPERATORS[operator]
    except (KeyError, TypeError) as exc:
        # Report an unsupported (or unhashable) operator as a malformed filter,
        # like the missing-key checks above, instead of leaking a bare lookup error
        msg = f"Unknown logical operator '{operator}'. Valid operators are: {list(LOGICAL_OPERATORS)}"
        raise FilterError(msg) from exc
    return evaluate(document, conditions)
|
||||
|
||||
|
||||
def _list_conditions(conditions: Any) -> List[Any]:
|
||||
"""
|
||||
Make sure all nested conditions are not dictionaries or single values, but always lists.
|
||||
def _comparison_condition(condition: Dict[str, Any], document: Document) -> bool:
|
||||
if "field" not in condition:
|
||||
# 'field' key is only found in comparison dictionaries.
|
||||
# We assume this is a logic dictionary since it's not present.
|
||||
return _logic_condition(condition, document)
|
||||
field: str = condition["field"]
|
||||
|
||||
:param conditions: the conditions to transform into a list
|
||||
:returns: a list of filters
|
||||
"""
|
||||
if isinstance(conditions, list):
|
||||
return conditions
|
||||
if isinstance(conditions, dict):
|
||||
return [{key: value} for key, value in conditions.items()]
|
||||
return [conditions]
|
||||
if "operator" not in condition:
|
||||
msg = f"'operator' key missing in {condition}"
|
||||
raise FilterError(msg)
|
||||
if "value" not in condition:
|
||||
msg = f"'value' key missing in {condition}"
|
||||
raise FilterError(msg)
|
||||
|
||||
if "." in field:
|
||||
# Handles fields formatted like so:
|
||||
# 'meta.person.name'
|
||||
parts = field.split(".")
|
||||
document_value = getattr(document, parts[0])
|
||||
for part in parts[1:]:
|
||||
if part not in document_value:
|
||||
# If a field is not found we treat it as None
|
||||
document_value = None
|
||||
break
|
||||
document_value = document_value[part]
|
||||
elif field not in [f.name for f in fields(document)]:
|
||||
# Converted legacy filters don't add the `meta.` prefix, so we assume
|
||||
# that all filter fields that are not actual fields in Document are converted
|
||||
# filters.
|
||||
#
|
||||
# We handle this to avoid breaking compatibility with converted legacy filters.
|
||||
# This will be removed as soon as we stop supporting legacy filters.
|
||||
document_value = document.meta.get(field)
|
||||
else:
|
||||
document_value = getattr(document, field)
|
||||
operator: str = condition["operator"]
|
||||
filter_value: Any = condition["value"]
|
||||
return COMPARISON_OPERATORS[operator](filter_value=filter_value, document_value=document_value)
|
||||
|
||||
|
||||
def convert(filters: Dict[str, Any]) -> Dict[str, Any]:
|
||||
|
||||
87
releasenotes/notes/rework-filters-1bb103d196a1912b.yaml
Normal file
87
releasenotes/notes/rework-filters-1bb103d196a1912b.yaml
Normal file
@ -0,0 +1,87 @@
|
||||
---
|
||||
prelude: >
|
||||
With proposal [#6001](https://github.com/deepset-ai/haystack/pull/6001) we introduced a better specification to declare filters in Haystack 2.x.
|
||||
The new syntax is a bit more verbose but less confusing and ambiguous as there are no implicit operators.
|
||||
This will simplify conversion from this common syntax to a Document Store specific filtering logic, so it will ease
|
||||
development of new Document Stores.
|
||||
Since everything must be declared explicitly, it will also make it easier for users to understand the filters just
|
||||
by reading them.
|
||||
|
||||
The full specification is as follows.
|
||||
|
||||
---
|
||||
|
||||
The top level of a filter must be a dictionary.
|
||||
|
||||
There are two types of dictionaries:
|
||||
|
||||
- Comparison
|
||||
- Logic
|
||||
|
||||
The top level can be either a Comparison or a Logic dictionary.
|
||||
|
||||
Comparison dictionaries must contain the keys:
|
||||
|
||||
- `field`
|
||||
- `operator`
|
||||
- `value`
|
||||
|
||||
Logic dictionaries must contain the keys:
|
||||
|
||||
- `operator`
|
||||
- `conditions`
|
||||
|
||||
`conditions` key must be a list of dictionaries, either Comparison or Logic.
|
||||
|
||||
`operator` values in Comparison dictionaries must be:
|
||||
|
||||
- `==`
|
||||
- `!=`
|
||||
- `>`
|
||||
- `>=`
|
||||
- `<`
|
||||
- `<=`
|
||||
- `in`
|
||||
- `not in`
|
||||
|
||||
`operator` values in Logic dictionaries must be:
|
||||
|
||||
- `NOT`
|
||||
- `OR`
|
||||
- `AND`
|
||||
|
||||
---
|
||||
|
||||
A simple filter:
|
||||
|
||||
```python
|
||||
filters = {"field": "meta.type", "operator": "==", "value": "article"}
|
||||
```
|
||||
|
||||
A more complex filter:
|
||||
```python
|
||||
filters = {
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.type", "operator": "==", "value": "article"},
|
||||
{"field": "meta.date", "operator": ">=", "value": 1420066800},
|
||||
{"field": "meta.date", "operator": "<", "value": 1609455600},
|
||||
{"field": "meta.rating", "operator": ">=", "value": 3},
|
||||
{
|
||||
"operator": "OR",
|
||||
"conditions": [
|
||||
{"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
|
||||
{"field": "meta.publisher", "operator": "==", "value": "nytimes"},
|
||||
],
|
||||
},
|
||||
],
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
To avoid causing too much disruption for users using legacy filters, we'll keep supporting them for the time being.
|
||||
We also provide a utility `convert` function for developers implementing their Document Store to do the same.
|
||||
preview:
|
||||
- |
|
||||
Refactored `InMemoryDocumentStore` and `MetadataRouter` filtering logic to support new filters declaration.
|
||||
@ -8,8 +8,20 @@ class TestMetadataRouter:
|
||||
@pytest.mark.unit
|
||||
def test_run(self):
|
||||
rules = {
|
||||
"edge_1": {"created_at": {"$gte": "2023-01-01", "$lt": "2023-04-01"}},
|
||||
"edge_2": {"created_at": {"$gte": "2023-04-01", "$lt": "2023-07-01"}},
|
||||
"edge_1": {
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"},
|
||||
{"field": "meta.created_at", "operator": "<", "value": "2023-04-01"},
|
||||
],
|
||||
},
|
||||
"edge_2": {
|
||||
"operator": "AND",
|
||||
"conditions": [
|
||||
{"field": "meta.created_at", "operator": ">=", "value": "2023-04-01"},
|
||||
{"field": "meta.created_at", "operator": "<", "value": "2023-07-01"},
|
||||
],
|
||||
},
|
||||
}
|
||||
router = MetadataRouter(rules=rules)
|
||||
documents = [
|
||||
|
||||
@ -146,10 +146,6 @@ class TestMemoryDocumentStore(DocumentStoreBaseTests): # pylint: disable=R0904
|
||||
results = document_store.bm25_retrieval(query="Python", top_k=1)
|
||||
assert results[0].content == "Python is a popular programming language"
|
||||
|
||||
@pytest.mark.skip(reason="Filter is not working properly, see https://github.com/deepset-ai/haystack/issues/6153")
|
||||
def test_eq_filter_embedding(self, document_store: InMemoryDocumentStore, filterable_docs):
|
||||
pass
|
||||
|
||||
# Test a query, add a new document and make sure results are appropriately updated
|
||||
@pytest.mark.unit
|
||||
def test_bm25_retrieval_with_updated_docs(self, document_store: InMemoryDocumentStore):
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user