diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index 37769e71d..1b0e5ad75 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -18,11 +18,11 @@ except (ImportError, ModuleNotFoundError) as ie: _optional_component_not_installed(__name__, "weaviate", ie) from haystack.schema import Document, FilterType, Label -from haystack.document_stores import BaseDocumentStore +from haystack.document_stores import KeywordDocumentStore from haystack.document_stores.base import get_batches_from_generator from haystack.document_stores.filter_utils import LogicalFilterClause from haystack.document_stores.utils import convert_date_to_rfc3339 -from haystack.errors import DocumentStoreError +from haystack.errors import DocumentStoreError, HaystackError from haystack.nodes.retriever import DenseRetriever @@ -34,7 +34,7 @@ class WeaviateDocumentStoreError(DocumentStoreError): pass -class WeaviateDocumentStore(BaseDocumentStore): +class WeaviateDocumentStore(KeywordDocumentStore): """ Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. 
def query_batch(
    self,
    queries: List[str],
    filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
    top_k: int = 10,
    custom_query: Optional[str] = None,
    index: Optional[str] = None,
    headers: Optional[Dict[str, str]] = None,
    all_terms_must_match: bool = False,
    scale_score: bool = True,
) -> List[List[Document]]:
    """
    Scan through documents in DocumentStore and return a small number of documents
    that are most relevant to each of the provided queries as defined by keyword matching algorithms like BM25.

    Every query is executed separately through `query()` and the results are returned in the same order as
    `queries` (output: List of Lists of Documents).

    :param queries: List of query strings. Pass a list even for a single query: the queries are iterated
                    one by one, so a bare string would be split into one query per character.
    :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain
                    conditions. Can be a single filter that is applied to every query, or a list of filters
                    with exactly one entry (a filter or `None`) per query.
                    Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
                    operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
                    `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
                    Logical operator keys take a dictionary of metadata field names and/or logical operators as
                    value. Metadata field names take a dictionary of comparison operators as value. Comparison
                    operator keys take a single value or (in case of `"$in"`) a list of values as value.
                    If no logical operator is provided, `"$and"` is used as default operation. If no comparison
                    operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
                    operation.

                        __Example__:

                        ```python
                        filters = {
                            "$and": {
                                "type": {"$eq": "article"},
                                "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                                "rating": {"$gte": 3},
                                "$or": {
                                    "genre": {"$in": ["economy", "politics"]},
                                    "publisher": {"$eq": "nytimes"}
                                }
                            }
                        }
                        # or simpler using default operators
                        filters = {
                            "type": "article",
                            "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
                            "rating": {"$gte": 3},
                            "$or": {
                                "genre": ["economy", "politics"],
                                "publisher": "nytimes"
                            }
                        }
                        ```

                        To use the same logical operator multiple times on the same level, logical operators take
                        optionally a list of dictionaries as value.

                        __Example__:

                        ```python
                        filters = {
                            "$or": [
                                {
                                    "$and": {
                                        "Type": "News Paper",
                                        "Date": {
                                            "$lt": "2019-01-01"
                                        }
                                    }
                                },
                                {
                                    "$and": {
                                        "Type": "Blog Post",
                                        "Date": {
                                            "$gte": "2019-01-01"
                                        }
                                    }
                                }
                            ]
                        }
                        ```

    :param top_k: How many documents to return per query.
    :param custom_query: Custom query to be executed.
    :param index: The name of the index in the DocumentStore from which to retrieve documents
    :param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
    :param all_terms_must_match: Whether all terms of the query must match the document.
                                 If true all query terms must be present in a document in order to be retrieved (i.e. the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
                                 Otherwise at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
                                 Defaults to False.
    :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]).
                        If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant.
                        Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
    :return: One list of documents per query, in the order of `queries`.
    :raises HaystackError: If `filters` is a list whose length differs from the number of queries.
    """
    # TODO - This method currently just calls query multiple times. Adapt this once there is a batch querying
    #  endpoint in Weaviate, which is currently not available,
    #  see https://stackoverflow.com/questions/71558676/does-weaviate-support-bulk-query#comment126569547_71561939

    # Normalize `filters` into exactly one entry per query without rebinding the parameter.
    filters_per_query: List[Optional[FilterType]]
    if isinstance(filters, list):
        if len(filters) != len(queries):
            raise HaystackError(
                "Number of filters does not match number of queries. Please provide as many filters"
                " as queries or a single filter that will be applied to each query."
            )
        filters_per_query = filters
    elif filters is None:
        # A fresh dict per query: `[{}] * n` would alias one mutable dict across all calls,
        # so a mutation made while running one query would leak into the following ones.
        filters_per_query = [{} for _ in queries]
    else:
        # The caller's single filter is deliberately shared: it applies unchanged to every query.
        filters_per_query = [filters] * len(queries)

    # Run each query against Weaviate separately and collect the per-query results in order.
    return [
        self.query(
            query=query,
            filters=cur_filters,
            top_k=top_k,
            custom_query=custom_query,
            index=index,
            headers=headers,
            all_terms_must_match=all_terms_must_match,
            scale_score=scale_score,
        )
        for query, cur_filters in zip(queries, filters_per_query)
    ]
ds.query(filters={"content": [query_text.lower()]}) + docs = ds.query(query=None, filters={"content": [query_text.lower()]}) assert len(docs) == 3 - docs = ds.query(filters={"content": ["baz"]}) + docs = ds.query(query=None, filters={"content": ["baz"]}) assert len(docs) == 3 @pytest.mark.integration diff --git a/test/nodes/test_retriever.py b/test/nodes/test_retriever.py index 7def53480..fea647d49 100644 --- a/test/nodes/test_retriever.py +++ b/test/nodes/test_retriever.py @@ -47,6 +47,8 @@ from ..conftest import SAMPLES_PATH, MockRetriever ("embedding", "milvus"), ("bm25", "elasticsearch"), ("bm25", "memory"), + ("bm25", "weaviate"), + ("es_filter_only", "elasticsearch"), ("tfidf", "memory"), ], indirect=True, @@ -55,10 +57,23 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_ if not isinstance(retriever_with_docs, (BM25Retriever, TfidfRetriever)): document_store_with_docs.update_embeddings(retriever_with_docs) - res = retriever_with_docs.retrieve(query="Who lives in Berlin?") - assert res[0].content == "My name is Carla and I live in Berlin" - assert len(res) == 5 - assert res[0].meta["name"] == "filename1" + # NOTE: FilterRetriever simply returns all documents matching a filter, + # so without filters applied it does nothing + if not isinstance(retriever_with_docs, FilterRetriever): + # the BM25 implementation in Weaviate would NOT pick up the expected records + # just with the "Who lives in Berlin?" query, but would return empty results, + # (maybe live & Berlin are stopwords in Weaviate? :-) ), so for Weaviate we need a query with better matching + # This was caused by lack of stemming and casing in Weaviate BM25 implementation + # TODO - In Weaviate 1.17.0 there is a fix for the lack of casing, which means that once 1.17.0 is released + # this `if` can be removed, as the standard search query "Who lives in Berlin?" should work with Weaviate. 
+ # See https://github.com/semi-technologies/weaviate/issues/2455#issuecomment-1355702003 + if isinstance(document_store_with_docs, WeaviateDocumentStore): + res = retriever_with_docs.retrieve(query="name is Carla, I live in Berlin") + else: + res = retriever_with_docs.retrieve(query="Who lives in Berlin?") + assert res[0].content == "My name is Carla and I live in Berlin" + assert len(res) == 5 + assert res[0].meta["name"] == "filename1" @pytest.mark.parametrize( @@ -71,6 +86,8 @@ def test_retrieval_without_filters(retriever_with_docs: BaseRetriever, document_ ("embedding", "elasticsearch"), ("embedding", "memory"), ("bm25", "elasticsearch"), + # TODO - add once Weaviate starts supporting filters with BM25 in Weaviate v1.18+ + # ("bm25", "weaviate"), ("es_filter_only", "elasticsearch"), ], indirect=True, @@ -197,6 +214,14 @@ def test_batch_retrieval_multiple_queries_with_filters(retriever_with_docs, docu if not isinstance(retriever_with_docs, (BM25Retriever, FilterRetriever)): document_store_with_docs.update_embeddings(retriever_with_docs) + # Weaviate does not support BM25 with filters yet, only after Weaviate v1.18.0 + # TODO - remove this once Weaviate starts supporting BM25 WITH filters + # You might also need to modify the first query, as Weaviate having problems with + # retrieving the "My name is Carla and I live in Berlin" record just with the + # "Who lives in Berlin?" BM25 query + if isinstance(document_store_with_docs, WeaviateDocumentStore): + return + res = retriever_with_docs.retrieve_batch( queries=["Who lives in Berlin?", "Who lives in New York?"], filters=[{"name": "filename1"}, None] )