refactor: Move filter utilities (2.0) (#5797)

* Move filter utilities

* PR feedback
This commit is contained in:
bogdankostic 2023-09-14 13:23:53 +02:00 committed by GitHub
parent ad5b615503
commit 1a212420b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 33 additions and 38 deletions

View File

@ -2,10 +2,6 @@ class DocumentStoreError(Exception):
pass pass
class FilterError(DocumentStoreError):
pass
class DuplicateDocumentError(DocumentStoreError): class DuplicateDocumentError(DocumentStoreError):
pass pass

View File

@ -1,4 +1,3 @@
from haystack.preview.document_stores.memory.document_store import MemoryDocumentStore from haystack.preview.document_stores.memory.document_store import MemoryDocumentStore
from haystack.preview.document_stores.memory.errors import MemoryDocumentStoreFilterError
__all__ = ["MemoryDocumentStore", "MemoryDocumentStoreFilterError"] __all__ = ["MemoryDocumentStore"]

View File

@ -11,7 +11,7 @@ from haystack.preview import default_from_dict, default_to_dict
from haystack.preview.document_stores.decorator import document_store from haystack.preview.document_stores.decorator import document_store
from haystack.preview.dataclasses import Document from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.protocols import DuplicatePolicy, DocumentStore from haystack.preview.document_stores.protocols import DuplicatePolicy, DocumentStore
from haystack.preview.document_stores.memory._filters import match from haystack.preview.utils.filters import document_matches_filter
from haystack.preview.document_stores.errors import DuplicateDocumentError, MissingDocumentError, DocumentStoreError from haystack.preview.document_stores.errors import DuplicateDocumentError, MissingDocumentError, DocumentStoreError
from haystack.preview.utils import expit from haystack.preview.utils import expit
@ -160,7 +160,7 @@ class MemoryDocumentStore:
:return: A list of Documents that match the given filters. :return: A list of Documents that match the given filters.
""" """
if filters: if filters:
return [doc for doc in self.storage.values() if match(conditions=filters, document=doc)] return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)]
return list(self.storage.values()) return list(self.storage.values())
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None: def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None:

View File

@ -1,5 +0,0 @@
from haystack.preview.document_stores.errors import FilterError
class MemoryDocumentStoreFilterError(FilterError):
pass

View File

@ -0,0 +1,2 @@
class FilterError(Exception):
pass

View File

@ -7,7 +7,8 @@ import pandas as pd
from haystack.preview.dataclasses import Document from haystack.preview.dataclasses import Document
from haystack.preview.document_stores import DocumentStore, DuplicatePolicy from haystack.preview.document_stores import DocumentStore, DuplicatePolicy
from haystack.preview.document_stores.errors import FilterError, MissingDocumentError, DuplicateDocumentError from haystack.preview.document_stores.errors import MissingDocumentError, DuplicateDocumentError
from haystack.preview.errors import FilterError
class DocumentStoreBaseTests: class DocumentStoreBaseTests:

View File

@ -1,2 +1,3 @@
from haystack.preview.utils.expit import expit from haystack.preview.utils.expit import expit
from haystack.preview.utils.requests_utils import request_with_retry from haystack.preview.utils.requests_utils import request_with_retry
from haystack.preview.utils.filters import document_matches_filter

View File

@ -1,10 +1,10 @@
from typing import List, Any from typing import List, Any, Union, Dict
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from haystack.preview.dataclasses import Document from haystack.preview.dataclasses import Document
from haystack.preview.document_stores.memory.errors import MemoryDocumentStoreFilterError from haystack.preview.errors import FilterError
GT_TYPES = (int, float, np.number) GT_TYPES = (int, float, np.number)
@ -32,10 +32,10 @@ def and_operation(conditions: List[Any], document: Document, _current_key: str):
:param _current_key: internal, don't use. :param _current_key: internal, don't use.
:return: True if the document matches all the filters, False otherwise :return: True if the document matches all the filters, False otherwise
""" """
for condition in conditions: return all(
if not match(conditions=condition, document=document, _current_key=_current_key): document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
return False for condition in conditions
return True )
def or_operation(conditions: List[Any], document: Document, _current_key: str): def or_operation(conditions: List[Any], document: Document, _current_key: str):
@ -45,12 +45,12 @@ def or_operation(conditions: List[Any], document: Document, _current_key: str):
:param conditions: the filters dictionary. :param conditions: the filters dictionary.
:param document: the document to test. :param document: the document to test.
:param _current_key: internal, don't use. :param _current_key: internal, don't use.
:return: True if the document matches ano of the filters, False otherwise :return: True if the document matches any of the filters, False otherwise
""" """
for condition in conditions: return any(
if match(conditions=condition, document=document, _current_key=_current_key): document_matches_filter(conditions=condition, document=document, _current_key=_current_key)
return True for condition in conditions
return False )
def _safe_eq(first: Any, second: Any) -> bool: def _safe_eq(first: Any, second: Any) -> bool:
@ -76,7 +76,7 @@ def _safe_gt(first: Any, second: Any) -> bool:
Works only for numerical values and dates. Strings, lists, tables and tensors all raise exceptions. Works only for numerical values and dates. Strings, lists, tables and tensors all raise exceptions.
""" """
if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES): if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES):
raise MemoryDocumentStoreFilterError( raise FilterError(
f"Can't evaluate '{type(first).__name__} > {type(second).__name__}'. " f"Can't evaluate '{type(first).__name__} > {type(second).__name__}'. "
f"Convert these values into one of the following types: {[type_.__name__ for type_ in GT_TYPES]}" f"Convert these values into one of the following types: {[type_.__name__ for type_ in GT_TYPES]}"
) )
@ -111,7 +111,7 @@ def in_operation(fields, field_name, value):
return False return False
if not isinstance(value, IN_TYPES): if not isinstance(value, IN_TYPES):
raise MemoryDocumentStoreFilterError("$in accepts only iterable values like lists, sets and tuples.") raise FilterError("$in accepts only iterable values like lists, sets and tuples.")
return any(_safe_eq(fields[field_name], v) for v in value) return any(_safe_eq(fields[field_name], v) for v in value)
@ -208,19 +208,22 @@ OPERATORS = {
RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()] RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()]
def match(conditions: Any, document: Document, _current_key=None): def document_matches_filter(conditions: Union[Dict, List], document: Document, _current_key=None):
""" """
This method applies the filters to any given document and returns True when the documents Check if a document's metadata matches the provided filter conditions.
metadata matches the filters, False otherwise.
:param conditions: the filters dictionary. This function evaluates the specified conditions against the metadata of the given document
:param document: the document to test. and returns True if the conditions are met, otherwise it returns False.
:return: True if the document matches the filters, False otherwise
:param conditions: A dictionary or list containing filter conditions to be applied to the document's metadata.
:param document: The document whose metadata will be evaluated against the conditions.
:param _current_key: internal parameter, don't use.
:return: True if the document's metadata matches the filter conditions, False otherwise.
""" """
if isinstance(conditions, dict): if isinstance(conditions, dict):
# Check for malformed filters, like {"name": {"year": "2020"}} # Check for malformed filters, like {"name": {"year": "2020"}}
if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()): if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()):
raise MemoryDocumentStoreFilterError( raise FilterError(
f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. " f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. "
"Comparisons between dictionaries are not currently supported. " "Comparisons between dictionaries are not currently supported. "
"Check the documentation to learn more about filters syntax." "Check the documentation to learn more about filters syntax."
@ -241,7 +244,7 @@ def match(conditions: Any, document: Document, _current_key=None):
# A comparison operator ($eq, $in, $gte, ...) # A comparison operator ($eq, $in, $gte, ...)
if field_key in OPERATORS.keys(): if field_key in OPERATORS.keys():
if not _current_key: if not _current_key:
raise MemoryDocumentStoreFilterError( raise FilterError(
"Filters can't start with an operator like $eq and $in. You have to specify the field name first. " "Filters can't start with an operator like $eq and $in. You have to specify the field name first. "
"See the examples in the documentation." "See the examples in the documentation."
) )
@ -264,9 +267,7 @@ def match(conditions: Any, document: Document, _current_key=None):
# The default operator for a {key: value} filter is $eq # The default operator for a {key: value} filter is $eq
return eq_operation(fields=document.flatten(), field_name=_current_key, value=conditions) return eq_operation(fields=document.flatten(), field_name=_current_key, value=conditions)
raise MemoryDocumentStoreFilterError( raise FilterError("Filters must be dictionaries or lists. See the examples in the documentation.")
"Filters must be dictionaries or lists. See the examples in the documentation."
)
def _list_conditions(conditions: Any) -> List[Any]: def _list_conditions(conditions: Any) -> List[Any]: