refactor: use weaviate client to build BM25 query (#3939)

* refactor: use weaviate client to build BM25 query

* refactor: remove manual BM25 query building

* refactor: apply BM25 to the content_field only

* test: update weaviate BM25 retrieval test case

update to account for lack of stemming

---------

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
This commit is contained in:
hsm207 2023-01-30 10:07:07 +01:00 committed by GitHub
parent a0d7817dd5
commit 08ec059b14
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 20 deletions

View File

@@ -1020,24 +1020,11 @@ class WeaviateDocumentStore(KeywordDocumentStore):
# BM25 retrieval without filtering
gql_query = (
gql.get.GetBuilder(class_name=index, properties=properties, connection=self.weaviate_client)
.with_near_vector({"vector": [0, 0]})
.with_limit(top_k)
.with_bm25({"query": query, "properties": self.content_field})
.build()
)
# Build the BM25 part of the GQL manually.
# Currently the GetBuilder of the Weaviate-client (v3.6.0)
# does not support the BM25 part of GQL building, so
# the BM25 part needs to be added manually.
# The BM25 query needs to be provided all lowercase while
# the functionality is in experimental mode in Weaviate,
# see https://app.slack.com/client/T0181DYT9KN/C017EG2SL3H/thread/C017EG2SL3H-1658790227.208119
bm25_gql_query = f"""bm25: {{
query: "{query.replace('"', ' ').lower()}",
properties: ["{self.content_field}"]
}}"""
gql_query = gql_query.replace("nearVector: {vector: [0, 0]}", bm25_gql_query)
query_output = self.weaviate_client.query.raw(gql_query)
results = []

View File

@@ -59,14 +59,13 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
# so without filters applied it does nothing
if not isinstance(retriever_with_docs, FilterRetriever):
# the BM25 implementation in Weaviate would NOT pick up the expected records
# just with the "Who lives in Berlin?" query, but would return empty results,
# (maybe live & Berlin are stopwords in Weaviate? :-) ), so for Weaviate we need a query with better matching
# This was caused by lack of stemming and casing in Weaviate BM25 implementation
# TODO - In Weaviate 1.17.0 there is a fix for the lack of casing, which means that once 1.17.0 is released
# because of the lack of stemming: "Who lives in berlin" returns only 1 record while
# "Who live in berlin" returns all 5 records.
# TODO - In Weaviate 1.19.0 there is a fix for the lack of stemming, which means that once 1.19.0 is released
# this `if` can be removed, as the standard search query "Who lives in Berlin?" should work with Weaviate.
# See https://github.com/semi-technologies/weaviate/issues/2455#issuecomment-1355702003
# See https://github.com/weaviate/weaviate/issues/2439
if isinstance(document_store_with_docs, WeaviateDocumentStore):
res = retriever_with_docs.retrieve(query="name is Carla, I live in Berlin")
res = retriever_with_docs.retrieve(query="Who live in berlin")
else:
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
assert res[0].content == "My name is Carla and I live in Berlin"