From 08ec059b14ba12cbf8d44f529855d87cbdea4ab7 Mon Sep 17 00:00:00 2001 From: hsm207 Date: Mon, 30 Jan 2023 10:07:07 +0100 Subject: [PATCH] refactor: use weaviate client to build BM25 query (#3939) * refactor: use weaviate client to build BM25 query * refactor: remove manual BM25 query building * refactor: apply BM25 to the content_field only * test: update weaviate BM25 retrieval test case to account for lack of stemming --------- Co-authored-by: Massimiliano Pippi --- haystack/document_stores/weaviate.py | 15 +-------------- test/nodes/test_retriever.py | 11 +++++------ 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index 31d4e03aa..b7a1733c8 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -1020,24 +1020,11 @@ class WeaviateDocumentStore(KeywordDocumentStore): # BM25 retrieval without filtering gql_query = ( gql.get.GetBuilder(class_name=index, properties=properties, connection=self.weaviate_client) - .with_near_vector({"vector": [0, 0]}) .with_limit(top_k) + .with_bm25({"query": query, "properties": self.content_field}) .build() ) - # Build the BM25 part of the GQL manually. - # Currently the GetBuilder of the Weaviate-client (v3.6.0) - # does not support the BM25 part of GQL building, so - # the BM25 part needs to be added manually. 
- # The BM25 query needs to be provided all lowercase while - # the functionality is in experimental mode in Weaviate, - # see https://app.slack.com/client/T0181DYT9KN/C017EG2SL3H/thread/C017EG2SL3H-1658790227.208119 - bm25_gql_query = f"""bm25: {{ - query: "{query.replace('"', ' ').lower()}", - properties: ["{self.content_field}"] - }}""" - gql_query = gql_query.replace("nearVector: {vector: [0, 0]}", bm25_gql_query) - query_output = self.weaviate_client.query.raw(gql_query) results = [] diff --git a/test/nodes/test_retriever.py b/test/nodes/test_retriever.py index b7cd3cbc7..5479a7d81 100644 --- a/test/nodes/test_retriever.py +++ b/test/nodes/test_retriever.py @@ -59,14 +59,13 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_ # so without filters applied it does nothing if not isinstance(retriever_with_docs, FilterRetriever): # the BM25 implementation in Weaviate would NOT pick up the expected records - # just with the "Who lives in Berlin?" query, but would return empty results, - # (maybe live & Berlin are stopwords in Weaviate? :-) ), so for Weaviate we need a query with better matching - # This was caused by lack of stemming and casing in Weaviate BM25 implementation - # TODO - In Weaviate 1.17.0 there is a fix for the lack of casing, which means that once 1.17.0 is released + # because of the lack of stemming: "Who lives in berlin" returns only 1 record while + # "Who live in berlin" returns all 5 records. + # TODO - In Weaviate 1.19.0 there is a fix for the lack of stemming, which means that once 1.19.0 is released # this `if` can be removed, as the standard search query "Who lives in Berlin?" should work with Weaviate. 
- # See https://github.com/semi-technologies/weaviate/issues/2455#issuecomment-1355702003 + # See https://github.com/weaviate/weaviate/issues/2439 if isinstance(document_store_with_docs, WeaviateDocumentStore): - res = retriever_with_docs.retrieve(query="name is Carla, I live in Berlin") + res = retriever_with_docs.retrieve(query="Who live in berlin") else: res = retriever_with_docs.retrieve(query="Who lives in Berlin?") assert res[0].content == "My name is Carla and I live in Berlin"