mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-03 18:36:04 +00:00
refactor: use weaviate client to build BM25 query (#3939)
* refactor: use weaviate client to build BM25 query
* refactor: remove manual BM25 query building
* refactor: apply BM25 to the content_field only
* test: update weaviate BM25 retrieval test case to account for lack of stemming
---------
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
This commit is contained in:
parent
a0d7817dd5
commit
08ec059b14
@@ -1020,24 +1020,11 @@ class WeaviateDocumentStore(KeywordDocumentStore):
|
||||
# BM25 retrieval without filtering
|
||||
gql_query = (
|
||||
gql.get.GetBuilder(class_name=index, properties=properties, connection=self.weaviate_client)
|
||||
.with_near_vector({"vector": [0, 0]})
|
||||
.with_limit(top_k)
|
||||
.with_bm25({"query": query, "properties": self.content_field})
|
||||
.build()
|
||||
)
|
||||
|
||||
# Build the BM25 part of the GQL manually.
|
||||
# Currently the GetBuilder of the Weaviate-client (v3.6.0)
|
||||
# does not support the BM25 part of GQL building, so
|
||||
# the BM25 part needs to be added manually.
|
||||
# The BM25 query needs to be provided all lowercase while
|
||||
# the functionality is in experimental mode in Weaviate,
|
||||
# see https://app.slack.com/client/T0181DYT9KN/C017EG2SL3H/thread/C017EG2SL3H-1658790227.208119
|
||||
bm25_gql_query = f"""bm25: {{
|
||||
query: "{query.replace('"', ' ').lower()}",
|
||||
properties: ["{self.content_field}"]
|
||||
}}"""
|
||||
gql_query = gql_query.replace("nearVector: {vector: [0, 0]}", bm25_gql_query)
|
||||
|
||||
query_output = self.weaviate_client.query.raw(gql_query)
|
||||
|
||||
results = []
|
||||
|
||||
@@ -59,14 +59,13 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
|
||||
# so without filters applied it does nothing
|
||||
if not isinstance(retriever_with_docs, FilterRetriever):
|
||||
# the BM25 implementation in Weaviate would NOT pick up the expected records
|
||||
# just with the "Who lives in Berlin?" query, but would return empty results,
|
||||
# (maybe live & Berlin are stopwords in Weaviate? :-) ), so for Weaviate we need a query with better matching
|
||||
# This was caused by lack of stemming and casing in Weaviate BM25 implementation
|
||||
# TODO - In Weaviate 1.17.0 there is a fix for the lack of casing, which means that once 1.17.0 is released
|
||||
# because of the lack of stemming: "Who lives in berlin" returns only 1 record while
|
||||
# "Who live in berlin" returns all 5 records.
|
||||
# TODO - In Weaviate 1.19.0 there is a fix for the lack of stemming, which means that once 1.19.0 is released
|
||||
# this `if` can be removed, as the standard search query "Who lives in Berlin?" should work with Weaviate.
|
||||
# See https://github.com/semi-technologies/weaviate/issues/2455#issuecomment-1355702003
|
||||
# See https://github.com/weaviate/weaviate/issues/2439
|
||||
if isinstance(document_store_with_docs, WeaviateDocumentStore):
|
||||
res = retriever_with_docs.retrieve(query="name is Carla, I live in Berlin")
|
||||
res = retriever_with_docs.retrieve(query="Who live in berlin")
|
||||
else:
|
||||
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
|
||||
assert res[0].content == "My name is Carla and I live in Berlin"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user