mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-03 03:09:28 +00:00
* Fixing broken BM25 support with Weaviate - fixes #3720 Unfortunately the BM25 support with Weaviate got broken with Haystack v1.11.0+, which is getting fixed with this commit. Please see more under issue #3720. * Fixing mypy issue - method signature wasn't matching the base class * Mypy related test fix Mypy forced me to set the signature of the `query` method of the Weaviate document store to the same as its parent, the `KeywordDocumentStore`, where the `query` parame is `Optional`, but has NO default value, so it must be provided (as None) at runtime. I am not quite sure why the abstract method's `query` param was set without a default value while its type is `Optional`, but I didn't want to change that, so instead I have changed the Weaviate tests. * Adding a note regarding an upcomming fix in Weaviate v1.17.0 * Apply suggestions from code review * revert * [EMPTY] Re-trigger CI
This commit is contained in:
parent
56803e5465
commit
e143f7cc36
@ -18,11 +18,11 @@ except (ImportError, ModuleNotFoundError) as ie:
|
||||
_optional_component_not_installed(__name__, "weaviate", ie)
|
||||
|
||||
from haystack.schema import Document, FilterType, Label
|
||||
from haystack.document_stores import BaseDocumentStore
|
||||
from haystack.document_stores import KeywordDocumentStore
|
||||
from haystack.document_stores.base import get_batches_from_generator
|
||||
from haystack.document_stores.filter_utils import LogicalFilterClause
|
||||
from haystack.document_stores.utils import convert_date_to_rfc3339
|
||||
from haystack.errors import DocumentStoreError
|
||||
from haystack.errors import DocumentStoreError, HaystackError
|
||||
from haystack.nodes.retriever import DenseRetriever
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ class WeaviateDocumentStoreError(DocumentStoreError):
|
||||
pass
|
||||
|
||||
|
||||
class WeaviateDocumentStore(BaseDocumentStore):
|
||||
class WeaviateDocumentStore(KeywordDocumentStore):
|
||||
"""
|
||||
|
||||
Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models.
|
||||
@ -849,13 +849,13 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
|
||||
def query(
|
||||
self,
|
||||
query: Optional[str] = None,
|
||||
query: Optional[str],
|
||||
filters: Optional[FilterType] = None,
|
||||
top_k: int = 10,
|
||||
all_terms_must_match: bool = False,
|
||||
custom_query: Optional[str] = None,
|
||||
index: Optional[str] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
all_terms_must_match: bool = False,
|
||||
scale_score: bool = True,
|
||||
) -> List[Document]:
|
||||
"""
|
||||
@ -1034,6 +1034,134 @@ class WeaviateDocumentStore(BaseDocumentStore):
|
||||
|
||||
return documents
|
||||
|
||||
def query_batch(
|
||||
self,
|
||||
queries: List[str],
|
||||
filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
|
||||
top_k: int = 10,
|
||||
custom_query: Optional[str] = None,
|
||||
index: Optional[str] = None,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
all_terms_must_match: bool = False,
|
||||
scale_score: bool = True,
|
||||
) -> List[List[Document]]:
|
||||
"""
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
that are most relevant to the provided queries as defined by keyword matching algorithms like BM25.
|
||||
|
||||
This method lets you find relevant documents for a single query string (output: List of Documents), or a
|
||||
a list of query strings (output: List of Lists of Documents).
|
||||
|
||||
:param queries: Single query or list of queries.
|
||||
:param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
|
||||
conditions.
|
||||
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
|
||||
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
|
||||
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
|
||||
Logical operator keys take a dictionary of metadata field names and/or logical operators as
|
||||
value. Metadata field names take a dictionary of comparison operators as value. Comparison
|
||||
operator keys take a single value or (in case of `"$in"`) a list of values as value.
|
||||
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
|
||||
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
|
||||
operation.
|
||||
|
||||
__Example__:
|
||||
|
||||
```python
|
||||
filters = {
|
||||
"$and": {
|
||||
"type": {"$eq": "article"},
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": {"$in": ["economy", "politics"]},
|
||||
"publisher": {"$eq": "nytimes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
# or simpler using default operators
|
||||
filters = {
|
||||
"type": "article",
|
||||
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
|
||||
"rating": {"$gte": 3},
|
||||
"$or": {
|
||||
"genre": ["economy", "politics"],
|
||||
"publisher": "nytimes"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
To use the same logical operator multiple times on the same level, logical operators take
|
||||
optionally a list of dictionaries as value.
|
||||
|
||||
__Example__:
|
||||
|
||||
```python
|
||||
filters = {
|
||||
"$or": [
|
||||
{
|
||||
"$and": {
|
||||
"Type": "News Paper",
|
||||
"Date": {
|
||||
"$lt": "2019-01-01"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"$and": {
|
||||
"Type": "Blog Post",
|
||||
"Date": {
|
||||
"$gte": "2019-01-01"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
:param top_k: How many documents to return per query.
|
||||
:param custom_query: Custom query to be executed.
|
||||
:param index: The name of the index in the DocumentStore from which to retrieve documents
|
||||
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
|
||||
:param all_terms_must_match: Whether all terms of the query must match the document.
|
||||
If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
|
||||
Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
|
||||
Defaults to False.
|
||||
:param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
|
||||
If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
|
||||
Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
|
||||
"""
|
||||
# TODO - This method currently just calls query multiple times. Adapt this once there is a batch querying
|
||||
# endpoint in Weaviate, which is currently not available,
|
||||
# see https://stackoverflow.com/questions/71558676/does-weaviate-support-bulk-query#comment126569547_71561939
|
||||
|
||||
documents = []
|
||||
|
||||
if isinstance(filters, list):
|
||||
if len(filters) != len(queries):
|
||||
raise HaystackError(
|
||||
"Number of filters does not match number of queries. Please provide as many filters"
|
||||
" as queries or a single filter that will be applied to each query."
|
||||
)
|
||||
else:
|
||||
filters = [filters] * len(queries) if filters is not None else [{}] * len(queries)
|
||||
|
||||
# run each query against Weaviate separately and combine the returned documents
|
||||
for query, cur_filters in zip(queries, filters):
|
||||
cur_docs = self.query(
|
||||
query=query,
|
||||
filters=cur_filters,
|
||||
top_k=top_k,
|
||||
custom_query=custom_query,
|
||||
index=index,
|
||||
headers=headers,
|
||||
all_terms_must_match=all_terms_must_match,
|
||||
scale_score=scale_score,
|
||||
)
|
||||
documents.append(cur_docs)
|
||||
|
||||
return documents
|
||||
|
||||
def query_by_embedding(
|
||||
self,
|
||||
query_emb: np.ndarray,
|
||||
|
||||
@ -184,13 +184,13 @@ class TestWeaviateDocumentStore(DocumentStoreBaseTestAbstract):
|
||||
# docs = ds.query(query_text, filters={"name": ["name_1"]})
|
||||
# assert len(docs) == 1
|
||||
|
||||
docs = ds.query(filters={"name": ["name_0"]})
|
||||
docs = ds.query(query=None, filters={"name": ["name_0"]})
|
||||
assert len(docs) == 3
|
||||
|
||||
docs = ds.query(filters={"content": [query_text.lower()]})
|
||||
docs = ds.query(query=None, filters={"content": [query_text.lower()]})
|
||||
assert len(docs) == 3
|
||||
|
||||
docs = ds.query(filters={"content": ["baz"]})
|
||||
docs = ds.query(query=None, filters={"content": ["baz"]})
|
||||
assert len(docs) == 3
|
||||
|
||||
@pytest.mark.integration
|
||||
|
||||
@ -47,6 +47,8 @@ from ..conftest import SAMPLES_PATH, MockRetriever
|
||||
("embedding", "milvus"),
|
||||
("bm25", "elasticsearch"),
|
||||
("bm25", "memory"),
|
||||
("bm25", "weaviate"),
|
||||
("es_filter_only", "elasticsearch"),
|
||||
("tfidf", "memory"),
|
||||
],
|
||||
indirect=True,
|
||||
@ -55,10 +57,23 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
|
||||
if not isinstance(retriever_with_docs, (BM25Retriever, TfidfRetriever)):
|
||||
document_store_with_docs.update_embeddings(retriever_with_docs)
|
||||
|
||||
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
|
||||
assert res[0].content == "My name is Carla and I live in Berlin"
|
||||
assert len(res) == 5
|
||||
assert res[0].meta["name"] == "filename1"
|
||||
# NOTE: FilterRetriever simply returns all documents matching a filter,
|
||||
# so without filters applied it does nothing
|
||||
if not isinstance(retriever_with_docs, FilterRetriever):
|
||||
# the BM25 implementation in Weaviate would NOT pick up the expected records
|
||||
# just with the "Who lives in Berlin?" query, but would return empty results,
|
||||
# (maybe live & Berlin are stopwords in Weaviate? :-) ), so for Weaviate we need a query with better matching
|
||||
# This was caused by lack of stemming and casing in Weaviate BM25 implementation
|
||||
# TODO - In Weaviate 1.17.0 there is a fix for the lack of casing, which means that once 1.17.0 is released
|
||||
# this `if` can be removed, as the standard search query "Who lives in Berlin?" should work with Weaviate.
|
||||
# See https://github.com/semi-technologies/weaviate/issues/2455#issuecomment-1355702003
|
||||
if isinstance(document_store_with_docs, WeaviateDocumentStore):
|
||||
res = retriever_with_docs.retrieve(query="name is Carla, I live in Berlin")
|
||||
else:
|
||||
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
|
||||
assert res[0].content == "My name is Carla and I live in Berlin"
|
||||
assert len(res) == 5
|
||||
assert res[0].meta["name"] == "filename1"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -71,6 +86,8 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_
|
||||
("embedding", "elasticsearch"),
|
||||
("embedding", "memory"),
|
||||
("bm25", "elasticsearch"),
|
||||
# TODO - add once Weaviate starts supporting filters with BM25 in Weaviate v1.18+
|
||||
# ("bm25", "weaviate"),
|
||||
("es_filter_only", "elasticsearch"),
|
||||
],
|
||||
indirect=True,
|
||||
@ -197,6 +214,14 @@ def test_batch_retrieval_multiple_queries_with_filters(retriever_with_docs, docu
|
||||
if not isinstance(retriever_with_docs, (BM25Retriever, FilterRetriever)):
|
||||
document_store_with_docs.update_embeddings(retriever_with_docs)
|
||||
|
||||
# Weaviate does not support BM25 with filters yet, only after Weaviate v1.18.0
|
||||
# TODO - remove this once Weaviate starts supporting BM25 WITH filters
|
||||
# You might also need to modify the first query, as Weaviate having problems with
|
||||
# retrieving the "My name is Carla and I live in Berlin" record just with the
|
||||
# "Who lives in Berlin?" BM25 query
|
||||
if isinstance(document_store_with_docs, WeaviateDocumentStore):
|
||||
return
|
||||
|
||||
res = retriever_with_docs.retrieve_batch(
|
||||
queries=["Who lives in Berlin?", "Who lives in New York?"], filters=[{"name": "filename1"}, None]
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user