diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8c561d893..02e0191a8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -369,7 +369,7 @@ jobs: uses: ./.github/actions/python_cache/ - name: Setup Weaviate - run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 + run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 # TODO Let's try to remove this one from the unit tests - name: Install pdftotext @@ -401,7 +401,7 @@ jobs: # prefix: windows # - name: Setup Weaviate - # run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 + # run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 # - name: Install pdftotext # run: | @@ -540,7 +540,7 @@ jobs: sudo docker-compose ps - name: Run Weaviate - run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 + run: docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 - name: Run GraphDB run: docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 
f12e24fc1..92a7853fe 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -170,7 +170,7 @@ wget https://github.com/milvus-io/milvus/releases/download/v2.0.0/milvus-standal docker-compose up -d # Weaviate -docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' semitechnologies/weaviate:1.11.0 +docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --env ENABLE_EXPERIMENTAL_BM25='true' semitechnologies/weaviate:1.14.1 # GraphDB docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11 diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index e7d9bba9e..4c71fac88 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -3689,7 +3689,7 @@ operation. #### WeaviateDocumentStore.query ```python -def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, scale_score: bool = True) -> List[Document] +def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -3763,9 +3763,11 @@ operation. } ``` - `top_k`: How many documents to return per query. +- `all_terms_must_match`: Not supported in Weaviate; setting it to `True` raises a `NotImplementedError`. 
- `custom_query`: Custom query that will executed using query.raw method, for more details refer https://weaviate.io/developers/weaviate/current/graphql-references/filters.html - `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Not supported in Weaviate; passing custom HTTP headers raises a `NotImplementedError`. - `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py index f24c83ce6..c845a5925 100644 --- a/haystack/document_stores/weaviate.py +++ b/haystack/document_stores/weaviate.py @@ -11,7 +11,7 @@ from tqdm import tqdm try: import weaviate - from weaviate import client, AuthClientPassword + from weaviate import client, AuthClientPassword, gql except (ImportError, ModuleNotFoundError) as ie: from haystack.utils.import_utils import _optional_component_not_installed @@ -814,8 +814,10 @@ class WeaviateDocumentStore(BaseDocumentStore): query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, + all_terms_must_match: bool = False, custom_query: Optional[str] = None, index: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, scale_score: bool = True, ) -> List[Document]: """ @@ -887,36 +889,95 @@ class WeaviateDocumentStore(BaseDocumentStore): } ``` :param top_k: How many documents to return per query. + :param all_terms_must_match: Not supported in Weaviate; setting it to `True` raises a `NotImplementedError`. 
:param custom_query: Custom query that will executed using query.raw method, for more details refer https://weaviate.io/developers/weaviate/current/graphql-references/filters.html :param index: The name of the index in the DocumentStore from which to retrieve documents + :param headers: Not supported in Weaviate; passing custom HTTP headers raises a `NotImplementedError`. :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]). If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. """ + if headers: + raise NotImplementedError("Weaviate does not support Custom HTTP headers!") + + if all_terms_must_match: + raise NotImplementedError("The `all_terms_must_match` option is not supported in Weaviate!") + index = self._sanitize_index_name(index) or self.index # Build the properties to retrieve from Weaviate properties = self._get_current_properties(index) properties.append("_additional {id, certainty, vector}") - if custom_query: - query_output = self.weaviate_client.query.raw(custom_query) - elif filters: - filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() - query_output = ( - self.weaviate_client.query.get(class_name=index, properties=properties) - .with_where(filter_dict) - .with_limit(top_k) - .do() - ) + if query is None: + + # Retrieval via custom query, no BM25 + if custom_query: + query_output = self.weaviate_client.query.raw(custom_query) + + # Naive retrieval without BM25, only filtering + elif filters: + filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() + query_output = ( + self.weaviate_client.query.get(class_name=index, properties=properties) + .with_where(filter_dict) + .with_limit(top_k) + .do() + ) + else: + raise NotImplementedError( + "Weaviate does not support the retrieval of records without specifying a query or a filter!" 
+ ) + + # Default Retrieval via BM25 using the user's query on `self.content_field` else: - raise NotImplementedError( - "Weaviate does not support inverted index text query. However, " - "it allows to search by filters example : {'content': 'some text'} or " - "use a custom GraphQL query in text format!" + logger.warning( + "As of v1.14.1 Weaviate's BM25 retrieval is still in experimental phase, " + "so use it with care! To turn on the BM25 experimental feature in Weaviate " + "you need to start it with the `ENABLE_EXPERIMENTAL_BM25='true'` " + "environmental variable." ) + # Retrieval with BM25 AND filtering + if filters: + raise NotImplementedError( + "Weaviate currently (v1.14.1) does not support filters WITH inverted index text query (eg BM25)!" + ) + + # Once Weaviate starts supporting filters with BM25: + # filter_dict = LogicalFilterClause.parse(filters).convert_to_weaviate() + # gql_query = weaviate.gql.get.GetBuilder(class_name=index, + # properties=properties, + # connection=self.weaviate_client) \ + # .with_near_vector({'vector': [0, 0]}) \ + # .with_where(filter_dict) \ + # .with_limit(top_k) \ + # .build() + + # BM25 retrieval without filtering + gql_query = ( + gql.get.GetBuilder(class_name=index, properties=properties, connection=self.weaviate_client) + .with_near_vector({"vector": [0, 0]}) + .with_limit(top_k) + .build() + ) + + # Build the BM25 part of the GQL manually. + # Currently the GetBuilder of the Weaviate-client (v3.6.0) + # does not support the BM25 part of GQL building, so + # the BM25 part needs to be added manually. 
+ # The BM25 query needs to be provided all lowercase while + # the functionality is in experimental mode in Weaviate, + # see https://app.slack.com/client/T0181DYT9KN/C017EG2SL3H/thread/C017EG2SL3H-1658790227.208119 + bm25_gql_query = f"""bm25: {{ + query: "{query.replace('"', ' ').lower()}", + properties: ["{self.content_field}"] + }}""" + gql_query = gql_query.replace("nearVector: {vector: [0, 0]}", bm25_gql_query) + + query_output = self.weaviate_client.query.raw(gql_query) + results = [] if query_output and "data" in query_output and "Get" in query_output.get("data"): if query_output.get("data").get("Get").get(index): diff --git a/setup.cfg b/setup.cfg index 6b40c8d2f..8c035d7d6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -141,7 +141,7 @@ only-milvus = milvus = farm-haystack[sql,only-milvus] weaviate = - weaviate-client==3.3.3 + weaviate-client==3.6.0 only-pinecone = pinecone-client pinecone = diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index fbdcb3cb3..79361f065 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -389,14 +389,14 @@ def test_get_documents_by_id(document_store: BaseDocumentStore): def test_get_document_count(document_store: BaseDocumentStore): documents = [ - {"content": "text1", "id": "1", "meta_field_for_count": "a"}, + {"content": "text1", "id": "1", "meta_field_for_count": "c"}, {"content": "text2", "id": "2", "meta_field_for_count": "b"}, {"content": "text3", "id": "3", "meta_field_for_count": "b"}, {"content": "text4", "id": "4", "meta_field_for_count": "b"}, ] document_store.write_documents(documents) assert document_store.get_document_count() == 4 - assert document_store.get_document_count(filters={"meta_field_for_count": ["a"]}) == 1 + assert document_store.get_document_count(filters={"meta_field_for_count": ["c"]}) == 1 assert document_store.get_document_count(filters={"meta_field_for_count": ["b"]}) == 3 diff --git 
a/test/document_stores/test_weaviate.py b/test/document_stores/test_weaviate.py index 1c020ce85..2ad144fa1 100644 --- a/test/document_stores/test_weaviate.py +++ b/test/document_stores/test_weaviate.py @@ -97,8 +97,12 @@ def test_query_by_embedding(document_store_with_docs): @pytest.mark.parametrize("document_store_with_docs", ["weaviate"], indirect=True) def test_query(document_store_with_docs): query_text = "My name is Carla and I live in Berlin" + docs = document_store_with_docs.query(query_text) + assert len(docs) == 3 + + # BM25 retrieval WITH filters is not yet supported as of Weaviate v1.14.1 with pytest.raises(Exception): - docs = document_store_with_docs.query(query_text) + docs = document_store_with_docs.query(query_text, filters={"name": ["filename2"]}) docs = document_store_with_docs.query(filters={"name": ["filename2"]}) assert len(docs) == 1