refactor: Move filter utilities (2.0) (#5797)

* Move filter utilities

* PR feedback
This commit is contained in:
bogdankostic 2023-09-14 13:23:53 +02:00 committed by GitHub
parent ad5b615503
commit 1a212420b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 33 additions and 38 deletions

View File

@ -2,10 +2,6 @@ class DocumentStoreError(Exception):
pass
class FilterError(DocumentStoreError):
pass
class DuplicateDocumentError(DocumentStoreError):
pass

View File

@ -1,4 +1,3 @@
from haystack.preview.document_stores.memory.document_store import MemoryDocumentStore
from haystack.preview.document_stores.memory.errors import MemoryDocumentStoreFilterError
__all__ = ["MemoryDocumentStore", "MemoryDocumentStoreFilterError"]
__all__ = ["MemoryDocumentStore"]

View File

@ -11,7 +11,7 @@ from haystack.preview import default_from_dict, default_to_dict
from haystack.preview.document_stores.decorator import document_store
from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.protocols import DuplicatePolicy, DocumentStore
from haystack.preview.document_stores.memory._filters import match
from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.document_stores.errors import DuplicateDocumentError, MissingDocumentError, DocumentStoreError
from haystack.preview.utils import expit
@ -160,7 +160,7 @@ class MemoryDocumentStore:
:return: A list of Documents that match the given filters.
"""
if filters:
return [doc for doc in self.storage.values() if match(conditions=filters, document=doc)]
return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)]
return list(self.storage.values())
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None:

View File

@ -1,5 +0,0 @@
from haystack.preview.document_stores.errors import FilterError
class MemoryDocumentStoreFilterError(FilterError):
pass

View File

@ -0,0 +1,2 @@
class FilterError(Exception):
pass

View File

@ -7,7 +7,8 @@ import pandas as pd
from haystack.preview.dataclasses import Document
from haystack.preview.document_stores import DocumentStore, DuplicatePolicy
from haystack.preview.document_stores.errors import FilterError, MissingDocumentError, DuplicateDocumentError
from haystack.preview.document_stores.errors import MissingDocumentError, DuplicateDocumentError
from haystack.preview.errors import FilterError
class DocumentStoreBaseTests:

View File

@ -1,2 +1,3 @@
from haystack.preview.utils.expit import expit
from haystack.preview.utils.requests_utils import request_with_retry
from haystack.preview.utils.filters import document_matches_filter

View File

@ -1,10 +1,10 @@
from typing import List, Any
from typing import List, Any, Union, Dict
import numpy as np
import pandas as pd
from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.memory.errors import MemoryDocumentStoreFilterError
from haystack.preview.errors import FilterError
GT_TYPES = (int, float, np.number)
@ -32,10 +32,10 @@ def and_operation(conditions: List[Any], document: Document, _current_key: str):
:param _current_key: internal, don't use.
:return: True if the document matches all the filters, False otherwise
"""
for condition in conditions:
if not match(conditions=condition, document=document, _current_key=_current_key):
return False
return True
return all(
document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
for condition in conditions
)
def or_operation(conditions: List[Any], document: Document, _current_key: str):
@ -45,12 +45,12 @@ def or_operation(conditions: List[Any], document: Document, _current_key: str):
:param conditions: the filters dictionary.
:param document: the document to test.
:param _current_key: internal, don't use.
:return: True if the document matches ano of the filters, False otherwise
:return: True if the document matches any of the filters, False otherwise
"""
for condition in conditions:
if match(conditions=condition, document=document, _current_key=_current_key):
return True
return False
return any(
document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
for condition in conditions
)
def _safe_eq(first: Any, second: Any) -> bool:
@ -76,7 +76,7 @@ def _safe_gt(first: Any, second: Any) -> bool:
Works only for numerical values and dates. Strings, lists, tables and tensors all raise exceptions.
"""
if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES):
raise MemoryDocumentStoreFilterError(
raise FilterError(
f"Can't evaluate '{type(first).__name__} > {type(second).__name__}'. "
f"Convert these values into one of the following types: {[type_.__name__ for type_ in GT_TYPES]}"
)
@ -111,7 +111,7 @@ def in_operation(fields, field_name, value):
return False
if not isinstance(value, IN_TYPES):
raise MemoryDocumentStoreFilterError("$in accepts only iterable values like lists, sets and tuples.")
raise FilterError("$in accepts only iterable values like lists, sets and tuples.")
return any(_safe_eq(fields[field_name], v) for v in value)
@ -208,19 +208,22 @@ OPERATORS = {
RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()]
def match(conditions: Any, document: Document, _current_key=None):
def document_matches_filter(conditions: Union[Dict, List], document: Document, _current_key=None):
"""
This method applies the filters to any given document and returns True when the documents
metadata matches the filters, False otherwise.
Check if a document's metadata matches the provided filter conditions.
:param conditions: the filters dictionary.
:param document: the document to test.
:return: True if the document matches the filters, False otherwise
This function evaluates the specified conditions against the metadata of the given document
and returns True if the conditions are met, otherwise it returns False.
:param conditions: A dictionary or list containing filter conditions to be applied to the document's metadata.
:param document: The document whose metadata will be evaluated against the conditions.
:param _current_key: internal parameter, don't use.
:return: True if the document's metadata matches the filter conditions, False otherwise.
"""
if isinstance(conditions, dict):
# Check for malformed filters, like {"name": {"year": "2020"}}
if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()):
raise MemoryDocumentStoreFilterError(
raise FilterError(
f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. "
"Comparisons between dictionaries are not currently supported. "
"Check the documentation to learn more about filters syntax."
@ -241,7 +244,7 @@ def match(conditions: Any, document: Document, _current_key=None):
# A comparison operator ($eq, $in, $gte, ...)
if field_key in OPERATORS.keys():
if not _current_key:
raise MemoryDocumentStoreFilterError(
raise FilterError(
"Filters can't start with an operator like $eq and $in. You have to specify the field name first. "
"See the examples in the documentation."
)
@ -264,9 +267,7 @@ def match(conditions: Any, document: Document, _current_key=None):
# The default operator for a {key: value} filter is $eq
return eq_operation(fields=document.flatten(), field_name=_current_key, value=conditions)
raise MemoryDocumentStoreFilterError(
"Filters must be dictionaries or lists. See the examples in the documentation."
)
raise FilterError("Filters must be dictionaries or lists. See the examples in the documentation.")
def _list_conditions(conditions: Any) -> List[Any]: