From 1a212420b70d1df02edd2d1fbfe156acf2f455ce Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 14 Sep 2023 13:23:53 +0200 Subject: [PATCH] refactor: Move filter utilities (2.0) (#5797) * Move filter utilities * PR feedback --- haystack/preview/document_stores/errors.py | 4 -- .../document_stores/memory/__init__.py | 3 +- .../document_stores/memory/document_store.py | 4 +- .../preview/document_stores/memory/errors.py | 5 -- haystack/preview/errors.py | 2 + haystack/preview/testing/document_store.py | 3 +- haystack/preview/utils/__init__.py | 1 + .../memory/_filters.py => utils/filters.py} | 49 ++++++++++--------- 8 files changed, 33 insertions(+), 38 deletions(-) delete mode 100644 haystack/preview/document_stores/memory/errors.py create mode 100644 haystack/preview/errors.py rename haystack/preview/{document_stores/memory/_filters.py => utils/filters.py} (86%) diff --git a/haystack/preview/document_stores/errors.py b/haystack/preview/document_stores/errors.py index 85830be5c..c345b04e5 100644 --- a/haystack/preview/document_stores/errors.py +++ b/haystack/preview/document_stores/errors.py @@ -2,10 +2,6 @@ class DocumentStoreError(Exception): pass -class FilterError(DocumentStoreError): - pass - - class DuplicateDocumentError(DocumentStoreError): pass diff --git a/haystack/preview/document_stores/memory/__init__.py b/haystack/preview/document_stores/memory/__init__.py index e09a85c78..16a8dc380 100644 --- a/haystack/preview/document_stores/memory/__init__.py +++ b/haystack/preview/document_stores/memory/__init__.py @@ -1,4 +1,3 @@ from haystack.preview.document_stores.memory.document_store import MemoryDocumentStore -from haystack.preview.document_stores.memory.errors import MemoryDocumentStoreFilterError -__all__ = ["MemoryDocumentStore", "MemoryDocumentStoreFilterError"] +__all__ = ["MemoryDocumentStore"] diff --git a/haystack/preview/document_stores/memory/document_store.py b/haystack/preview/document_stores/memory/document_store.py index 61679646f..917f35549 100644 --- a/haystack/preview/document_stores/memory/document_store.py +++ b/haystack/preview/document_stores/memory/document_store.py @@ -11,7 +11,7 @@ from haystack.preview import default_from_dict, default_to_dict from haystack.preview.document_stores.decorator import document_store from haystack.preview.dataclasses import Document from haystack.preview.document_stores.protocols import DuplicatePolicy, DocumentStore -from haystack.preview.document_stores.memory._filters import match +from haystack.preview.utils.filters import document_matches_filter from haystack.preview.document_stores.errors import DuplicateDocumentError, MissingDocumentError, DocumentStoreError from haystack.preview.utils import expit @@ -160,7 +160,7 @@ class MemoryDocumentStore: :return: A list of Documents that match the given filters. """ if filters: - return [doc for doc in self.storage.values() if match(conditions=filters, document=doc)] + return [doc for doc in self.storage.values() if document_matches_filter(conditions=filters, document=doc)] return list(self.storage.values()) def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None: diff --git a/haystack/preview/document_stores/memory/errors.py b/haystack/preview/document_stores/memory/errors.py deleted file mode 100644 index c1c0b5948..000000000 --- a/haystack/preview/document_stores/memory/errors.py +++ /dev/null @@ -1,5 +0,0 @@ -from haystack.preview.document_stores.errors import FilterError - - -class MemoryDocumentStoreFilterError(FilterError): - pass diff --git a/haystack/preview/errors.py b/haystack/preview/errors.py new file mode 100644 index 000000000..c7a6c47d6 --- /dev/null +++ b/haystack/preview/errors.py @@ -0,0 +1,2 @@ +class FilterError(Exception): + pass diff --git a/haystack/preview/testing/document_store.py b/haystack/preview/testing/document_store.py index 991b1f842..9127fdab1 100644 --- a/haystack/preview/testing/document_store.py +++ b/haystack/preview/testing/document_store.py @@ -7,7 +7,8 @@ import pandas as pd from haystack.preview.dataclasses import Document from haystack.preview.document_stores import DocumentStore, DuplicatePolicy -from haystack.preview.document_stores.errors import FilterError, MissingDocumentError, DuplicateDocumentError +from haystack.preview.document_stores.errors import MissingDocumentError, DuplicateDocumentError +from haystack.preview.errors import FilterError class DocumentStoreBaseTests: diff --git a/haystack/preview/utils/__init__.py b/haystack/preview/utils/__init__.py index adf41e033..a84ea468e 100644 --- a/haystack/preview/utils/__init__.py +++ b/haystack/preview/utils/__init__.py @@ -1,2 +1,3 @@ from haystack.preview.utils.expit import expit from haystack.preview.utils.requests_utils import request_with_retry +from haystack.preview.utils.filters import document_matches_filter diff --git a/haystack/preview/document_stores/memory/_filters.py b/haystack/preview/utils/filters.py similarity index 86% rename from haystack/preview/document_stores/memory/_filters.py rename to haystack/preview/utils/filters.py index 69b35b758..346e6760f 100644 --- a/haystack/preview/document_stores/memory/_filters.py +++ b/haystack/preview/utils/filters.py @@ -1,10 +1,10 @@ -from typing import List, Any +from typing import List, Any, Union, Dict import numpy as np import pandas as pd from haystack.preview.dataclasses import Document -from haystack.preview.document_stores.memory.errors import MemoryDocumentStoreFilterError +from haystack.preview.errors import FilterError GT_TYPES = (int, float, np.number) @@ -32,10 +32,10 @@ def and_operation(conditions: List[Any], document: Document, _current_key: str): :param _current_key: internal, don't use. :return: True if the document matches all the filters, False otherwise """ - for condition in conditions: - if not match(conditions=condition, document=document, _current_key=_current_key): - return False - return True + return all( + document_matches_filter(conditions=condition, document=document, _current_key=_current_key) + for condition in conditions + ) def or_operation(conditions: List[Any], document: Document, _current_key: str): @@ -45,12 +45,12 @@ def or_operation(conditions: List[Any], document: Document, _current_key: str): :param conditions: the filters dictionary. :param document: the document to test. :param _current_key: internal, don't use. - :return: True if the document matches ano of the filters, False otherwise + :return: True if the document matches any of the filters, False otherwise """ - for condition in conditions: - if match(conditions=condition, document=document, _current_key=_current_key): - return True - return False + return any( + document_matches_filter(conditions=condition, document=document, _current_key=_current_key) + for condition in conditions + ) def _safe_eq(first: Any, second: Any) -> bool: @@ -76,7 +76,7 @@ def _safe_gt(first: Any, second: Any) -> bool: Works only for numerical values and dates. Strings, lists, tables and tensors all raise exceptions. """ if not isinstance(first, GT_TYPES) or not isinstance(second, GT_TYPES): - raise MemoryDocumentStoreFilterError( + raise FilterError( f"Can't evaluate '{type(first).__name__} > {type(second).__name__}'. " f"Convert these values into one of the following types: {[type_.__name__ for type_ in GT_TYPES]}" ) @@ -111,7 +111,7 @@ def in_operation(fields, field_name, value): return False if not isinstance(value, IN_TYPES): - raise MemoryDocumentStoreFilterError("$in accepts only iterable values like lists, sets and tuples.") + raise FilterError("$in accepts only iterable values like lists, sets and tuples.") return any(_safe_eq(fields[field_name], v) for v in value) @@ -208,19 +208,22 @@ OPERATORS = { RESERVED_KEYS = [*LOGICAL_STATEMENTS.keys(), *OPERATORS.keys()] -def match(conditions: Any, document: Document, _current_key=None): +def document_matches_filter(conditions: Union[Dict, List], document: Document, _current_key=None): """ - This method applies the filters to any given document and returns True when the documents - metadata matches the filters, False otherwise. + Check if a document's metadata matches the provided filter conditions. - :param conditions: the filters dictionary. - :param document: the document to test. - :return: True if the document matches the filters, False otherwise + This function evaluates the specified conditions against the metadata of the given document + and returns True if the conditions are met, otherwise it returns False. + + :param conditions: A dictionary or list containing filter conditions to be applied to the document's metadata. + :param document: The document whose metadata will be evaluated against the conditions. + :param _current_key: internal parameter, don't use. + :return: True if the document's metadata matches the filter conditions, False otherwise. """ if isinstance(conditions, dict): # Check for malformed filters, like {"name": {"year": "2020"}} if _current_key and any(key not in RESERVED_KEYS for key in conditions.keys()): - raise MemoryDocumentStoreFilterError( + raise FilterError( f"This filter ({{{_current_key}: {conditions}}}) seems to be malformed. " "Comparisons between dictionaries are not currently supported. " "Check the documentation to learn more about filters syntax." @@ -241,7 +244,7 @@ def match(conditions: Any, document: Document, _current_key=None): # A comparison operator ($eq, $in, $gte, ...) if field_key in OPERATORS.keys(): if not _current_key: - raise MemoryDocumentStoreFilterError( + raise FilterError( "Filters can't start with an operator like $eq and $in. You have to specify the field name first. " "See the examples in the documentation." ) @@ -264,9 +267,7 @@ def match(conditions: Any, document: Document, _current_key=None): # The default operator for a {key: value} filter is $eq return eq_operation(fields=document.flatten(), field_name=_current_key, value=conditions) - raise MemoryDocumentStoreFilterError( - "Filters must be dictionaries or lists. See the examples in the documentation." - ) + raise FilterError("Filters must be dictionaries or lists. See the examples in the documentation.") def _list_conditions(conditions: Any) -> List[Any]: