Fixing broken BM25 support with Weaviate - fixes #3720 (#3723)

* Fixing broken BM25 support with Weaviate - fixes #3720 Unfortunately the BM25 support with Weaviate got broken with Haystack v1.11.0+, which is getting fixed with this commit. Please see more under issue #3720. * Fixing mypy issue - method signature wasn't matching the base class * Mypy related test fix Mypy forced me to set the signature of the `query` method of the Weaviate document store to the same as its parent, the `KeywordDocumentStore`, where the `query` parame is `Optional`, but has NO default value, so it must be provided (as None) at runtime. I am not quite sure why the abstract method's `query` param was set without a default value while its type is `Optional`, but I didn't want to change that, so instead I have changed the Weaviate tests. * Adding a note regarding an upcomming fix in Weaviate v1.17.0 * Apply suggestions from code review * revert * [EMPTY] Re-trigger CI
2025-12-27 15:08:43 +00:00 · 2022-12-19 11:24:46 -05:00 · 2022-12-19 11:24:46 -05:00 · e143f7cc36
commit e143f7cc36
parent 56803e5465
3 changed files with 165 additions and 12 deletions
--- a/haystack/document_stores/weaviate.py
+++ b/haystack/document_stores/weaviate.py
@ -18,11 +18,11 @@ except (ImportError, ModuleNotFoundError) as ie:
    _optional_component_not_installed(__name__, "weaviate", ie)

 from haystack.schema import Document, FilterType, Label
-from haystack.document_stores import BaseDocumentStore
+from haystack.document_stores import KeywordDocumentStore
 from haystack.document_stores.base import get_batches_from_generator
 from haystack.document_stores.filter_utils import LogicalFilterClause
 from haystack.document_stores.utils import convert_date_to_rfc3339
-from haystack.errors import DocumentStoreError
+from haystack.errors import DocumentStoreError, HaystackError
 from haystack.nodes.retriever import DenseRetriever


@ -34,7 +34,7 @@ class WeaviateDocumentStoreError(DocumentStoreError):
    pass


-class WeaviateDocumentStore(BaseDocumentStore):
+class WeaviateDocumentStore(KeywordDocumentStore):
    """

    Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
@ -849,13 +849,13 @@ class WeaviateDocumentStore(BaseDocumentStore):

    def query(
        self,
-        query: Optional[str] = None,
+        query: Optional[str],
        filters: Optional[FilterType] = None,
        top_k: int = 10,
-        all_terms_must_match: bool = False,
        custom_query: Optional[str] = None,
        index: Optional[str] = None,
        headers: Optional[Dict[str, str]] = None,
+        all_terms_must_match: bool = False,
        scale_score: bool = True,
    ) -> List[Document]:
        """
@ -1034,6 +1034,134 @@ class WeaviateDocumentStore(BaseDocumentStore):

        return documents

+    def query_batch(
+        self,
+        queries: List[str],
+        filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
+        top_k: int = 10,
+        custom_query: Optional[str] = None,
+        index: Optional[str] = None,
+        headers: Optional[Dict[str, str]] = None,
+        all_terms_must_match: bool = False,
+        scale_score: bool = True,
+    ) -> List[List[Document]]:
+        """
+        Scan through documents in DocumentStore and return a small number documents
+        that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
+
+        This method lets you find relevant documents for a single query string (output: List of Documents), or a
+        a list of query strings (output: List of Lists of Documents).
+
+        :param queries: Single query or list of queries.
+        :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
+                        conditions.
+                        Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+                        operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+                        `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+                        Logical operator keys take a dictionary of metadata field names and/or logical operators as
+                        value. Metadata field names take a dictionary of comparison operators as value. Comparison
+                        operator keys take a single value or (in case of `"$in"`) a list of values as value.
+                        If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+                        operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+                        operation.
+
+                            __Example__:
+
+                            ```python
+                            filters = {
+                                "$and": {
+                                    "type": {"$eq": "article"},
+                                    "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+                                    "rating": {"$gte": 3},
+                                    "$or": {
+                                        "genre": {"$in": ["economy", "politics"]},
+                                        "publisher": {"$eq": "nytimes"}
+                                    }
+                                }
+                            }
+                            # or simpler using default operators
+                            filters = {
+                                "type": "article",
+                                "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
+                                "rating": {"$gte": 3},
+                                "$or": {
+                                    "genre": ["economy", "politics"],
+                                    "publisher": "nytimes"
+                                }
+                            }
+                            ```
+
+                            To use the same logical operator multiple times on the same level, logical operators take
+                            optionally a list of dictionaries as value.
+
+                            __Example__:
+
+                            ```python
+                            filters = {
+                                "$or": [
+                                    {
+                                        "$and": {
+                                            "Type": "News Paper",
+                                            "Date": {
+                                                "$lt": "2019-01-01"
+                                            }
+                                        }
+                                    },
+                                    {
+                                        "$and": {
+                                            "Type": "Blog Post",
+                                            "Date": {
+                                                "$gte": "2019-01-01"
+                                            }
+                                        }
+                                    }
+                                ]
+                            }
+                            ```
+
+        :param top_k: How many documents to return per query.
+        :param custom_query: Custom query to be executed.
+        :param index: The name of the index in the DocumentStore from which to retrieve documents
+        :param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+        :param all_terms_must_match: Whether all terms of the query must match the document.
+                                     If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+                                     Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+                                     Defaults to False.
+        :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
+                            If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
+                            Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
+        """
+        # TODO - This method currently just calls query multiple times. Adapt this once there is a batch querying
+        # endpoint in Weaviate, which is currently not available,
+        # see https://stackoverflow.com/questions/71558676/does-weaviate-support-bulk-query#comment126569547_71561939
+
+        documents = []
+
+        if isinstance(filters, list):
+            if len(filters) != len(queries):
+                raise HaystackError(
+                    "Number of filters does not match number of queries. Please provide as many filters"
+                    " as queries or a single filter that will be applied to each query."
+                )
+        else:
+            filters = [filters] * len(queries) if filters is not None else [{}] * len(queries)
+
+        # run each query against Weaviate separately and combine the returned documents
+        for query, cur_filters in zip(queries, filters):
+            cur_docs = self.query(
+                query=query,
+                filters=cur_filters,
+                top_k=top_k,
+                custom_query=custom_query,
+                index=index,
+                headers=headers,
+                all_terms_must_match=all_terms_must_match,
+                scale_score=scale_score,
+            )
+            documents.append(cur_docs)
+
+        return documents
+
    def query_by_embedding(
        self,
        query_emb: np.ndarray,
--- a/test/document_stores/test_weaviate.py
+++ b/test/document_stores/test_weaviate.py
@ -184,13 +184,13 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
        # docs = ds.query(query_text, filters={"name": ["name_1"]})
        # assert len(docs) == 1

-        docs = ds.query(filters={"name": ["name_0"]})
+        docs = ds.query(query=None, filters={"name": ["name_0"]})
        assert len(docs) == 3

-        docs = ds.query(filters={"content": [query_text.lower()]})
+        docs = ds.query(query=None, filters={"content": [query_text.lower()]})
        assert len(docs) == 3

-        docs = ds.query(filters={"content": ["baz"]})
+        docs = ds.query(query=None, filters={"content": ["baz"]})
        assert len(docs) == 3

    @pytest.mark.integration
--- a/test/nodes/test_retriever.py
+++ b/test/nodes/test_retriever.py
@ -47,6 +47,8 @@ from ..conftest import SAMPLES_PATH, MockRetriever
        ("embedding", "milvus"),
        ("bm25", "elasticsearch"),
        ("bm25", "memory"),
+        ("bm25", "weaviate"),
+        ("es_filter_only", "elasticsearch"),
        ("tfidf", "memory"),
    ],
    indirect=True,
@ -55,10 +57,23 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
    if not isinstance(retriever_with_docs, (BM25Retriever, TfidfRetriever)):
        document_store_with_docs.update_embeddings(retriever_with_docs)

-    res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
-    assert res[0].content == "My name is Carla and I live in Berlin"
-    assert len(res) == 5
-    assert res[0].meta["name"] == "filename1"
+    # NOTE: FilterRetriever simply returns all documents matching a filter,
+    # so without filters applied it does nothing
+    if not isinstance(retriever_with_docs, FilterRetriever):
+        # the BM25 implementation in Weaviate would NOT pick up the expected records
+        # just with the "Who lives in Berlin?" query, but would return empty results,
+        # (maybe live & Berlin are stopwords in Weaviate? :-) ), so for Weaviate we need a query with better matching
+        # This was caused by lack of stemming and casing in Weaviate BM25 implementation
+        # TODO - In Weaviate 1.17.0 there is a fix for the lack of casing, which means that once 1.17.0 is released
+        # this `if` can be removed, as the standard search query "Who lives in Berlin?" should work with Weaviate.
+        # See https://github.com/semi-technologies/weaviate/issues/2455#issuecomment-1355702003
+        if isinstance(document_store_with_docs, WeaviateDocumentStore):
+            res = retriever_with_docs.retrieve(query="name is Carla, I live in Berlin")
+        else:
+            res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
+        assert res[0].content == "My name is Carla and I live in Berlin"
+        assert len(res) == 5
+        assert res[0].meta["name"] == "filename1"


@pytest.mark.parametrize(
@ -71,6 +86,8 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
        ("embedding", "elasticsearch"),
        ("embedding", "memory"),
        ("bm25", "elasticsearch"),
+        # TODO - add once Weaviate starts supporting filters with BM25 in Weaviate v1.18+
+        # ("bm25", "weaviate"),
        ("es_filter_only", "elasticsearch"),
    ],
    indirect=True,
@ -197,6 +214,14 @@ def test_batch_retrieval_multiple_queries_with_filters(retriever_with_docs, docu
    if not isinstance(retriever_with_docs, (BM25Retriever, FilterRetriever)):
        document_store_with_docs.update_embeddings(retriever_with_docs)

+    # Weaviate does not support BM25 with filters yet, only after Weaviate v1.18.0
+    # TODO - remove this once Weaviate starts supporting BM25 WITH filters
+    # You might also need to modify the first query, as Weaviate having problems with
+    # retrieving the "My name is Carla and I live in Berlin" record just with the
+    # "Who lives in Berlin?" BM25 query
+    if isinstance(document_store_with_docs, WeaviateDocumentStore):
+        return
+
    res = retriever_with_docs.retrieve_batch(
        queries=["Who lives in Berlin?", "Who lives in New York?"], filters=[{"name": "filename1"}, None]
    )