From a73717b2eab91abbc2e38cf2d5b39855fe5e19f8 Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Mon, 28 Mar 2022 22:10:50 +0200 Subject: [PATCH] Support conjunctive queries in sparse retrieval (#2361) * support conjunctive queries in sparse retrieval * fix typo * test added * Update Documentation & Code Style * fix test_DeepsetCloudDocumentStore_query Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/document_store.md | 18 ++++++++-- docs/_src/api/api/retriever.md | 6 +++- haystack/document_stores/base.py | 5 +++ haystack/document_stores/deepsetcloud.py | 13 ++++++- haystack/document_stores/elasticsearch.py | 17 ++++++++- .../haystack-pipeline-1.2.1rc0.schema.json | 10 ++++++ .../haystack-pipeline-unstable.schema.json | 10 ++++++ haystack/nodes/retriever/sparse.py | 23 ++++++++++-- haystack/utils/deepsetcloud.py | 2 ++ test/test_document_store.py | 3 +- test/test_retriever.py | 35 +++++++++++++++++++ 11 files changed, 133 insertions(+), 9 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 61b69bf7d..b369d6f32 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -305,7 +305,7 @@ Base class for implementing Document Stores that support keyword searches. 
```python @abstractmethod -def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -382,6 +382,10 @@ operation. - `custom_query`: Custom query to be executed. - `index`: The name of the index in the DocumentStore from which to retrieve documents - `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true, all query terms must be present in a document in order to be retrieved (i.e. the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise, at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. 
@@ -762,7 +766,7 @@ Return all labels in the document store #### query ```python -def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -903,6 +907,10 @@ You will find the highlighted output in the returned Document's meta field by ke - `index`: The name of the index in the DocumentStore from which to retrieve documents - `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true, all query terms must be present in a document in order to be retrieved (i.e. the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise, at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. @@ -4135,7 +4143,7 @@ operation. 
#### query ```python -def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -4212,6 +4220,10 @@ operation. - `custom_query`: Custom query to be executed. - `index`: The name of the index in the DocumentStore from which to retrieve documents - `headers`: Custom HTTP headers to pass to requests +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true, all query terms must be present in a document in order to be retrieved (i.e. the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise, at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. 
diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 90e7db44f..bb75023bb 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -107,12 +107,16 @@ class ElasticsearchRetriever(BaseRetriever) #### \_\_init\_\_ ```python -def __init__(document_store: KeywordDocumentStore, top_k: int = 10, custom_query: Optional[str] = None) +def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None) ``` **Arguments**: - `document_store`: an instance of an ElasticsearchDocumentStore to retrieve documents from. +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true, all query terms must be present in a document in order to be retrieved (i.e. the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise, at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. - `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query). Optionally, ES `filter` clause can be added where the values of `terms` are placeholders that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) 
diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index 6f44d024f..8c3737bb4 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -662,6 +662,7 @@ class KeywordDocumentStore(BaseDocumentStore): custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, + all_terms_must_match: bool = False, ) -> List[Document]: """ Scan through documents in DocumentStore and return a small number documents @@ -736,6 +737,10 @@ class KeywordDocumentStore(BaseDocumentStore): :param custom_query: Custom query to be executed. :param index: The name of the index in the DocumentStore from which to retrieve documents :param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + :param all_terms_must_match: Whether all terms of the query must match the document. + If true, all query terms must be present in a document in order to be retrieved (i.e. the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). + Otherwise, at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). + Defaults to False. 
""" diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py index 5fdce4f28..e6ccdafe4 100644 --- a/haystack/document_stores/deepsetcloud.py +++ b/haystack/document_stores/deepsetcloud.py @@ -327,6 +327,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, + all_terms_must_match: bool = False, ) -> List[Document]: """ Scan through documents in DocumentStore and return a small number documents @@ -400,9 +401,19 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore): :param custom_query: Custom query to be executed. :param index: The name of the index in the DocumentStore from which to retrieve documents :param headers: Custom HTTP headers to pass to requests + :param all_terms_must_match: Whether all terms of the query must match the document. + If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). + Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). + Defaults to False. 
""" doc_dicts = self.client.query( - query=query, filters=filters, top_k=top_k, custom_query=custom_query, index=index, headers=headers + query=query, + filters=filters, + top_k=top_k, + custom_query=custom_query, + index=index, + all_terms_must_match=all_terms_must_match, + headers=headers, ) docs = [Document.from_dict(doc) for doc in doc_dicts] return docs diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index eb11c8918..f107f1846 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -872,6 +872,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, + all_terms_must_match: bool = False, ) -> List[Document]: """ Scan through documents in DocumentStore and return a small number documents @@ -1011,6 +1012,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): :param index: The name of the index in the DocumentStore from which to retrieve documents :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + :param all_terms_must_match: Whether all terms of the query must match the document. + If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). + Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). + Defaults to false. 
""" if index is None: @@ -1045,11 +1050,21 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): "The query provided seems to be not a string, but an object " f"of type {type(query)}. This can cause Elasticsearch to fail." ) + operator = "AND" if all_terms_must_match else "OR" body = { "size": str(top_k), "query": { "bool": { - "must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}] + "must": [ + { + "multi_match": { + "query": query, + "type": "most_fields", + "fields": self.search_fields, + "operator": operator, + } + } + ] } }, } diff --git a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json index e5268a5a4..7a44ebf43 100644 --- a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json +++ b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json @@ -1630,6 +1630,11 @@ "default": 10, "type": "integer" }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, "custom_query": { "title": "Custom Query", "type": "string" @@ -1675,6 +1680,11 @@ "default": 10, "type": "integer" }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, "custom_query": { "title": "Custom Query", "type": "string" diff --git a/haystack/json-schemas/haystack-pipeline-unstable.schema.json b/haystack/json-schemas/haystack-pipeline-unstable.schema.json index 31af33b03..b85f3583e 100644 --- a/haystack/json-schemas/haystack-pipeline-unstable.schema.json +++ b/haystack/json-schemas/haystack-pipeline-unstable.schema.json @@ -1633,6 +1633,11 @@ "default": 10, "type": "integer" }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, "custom_query": { "title": "Custom Query", "type": "string" @@ -1678,6 +1683,11 @@ "default": 10, "type": "integer" }, + "all_terms_must_match": { + "title": "All Terms Must Match", + 
"default": false, + "type": "boolean" + }, "custom_query": { "title": "Custom Query", "type": "string" diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index cba1ac613..ff61b9b53 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -16,9 +16,19 @@ logger = logging.getLogger(__name__) class ElasticsearchRetriever(BaseRetriever): - def __init__(self, document_store: KeywordDocumentStore, top_k: int = 10, custom_query: Optional[str] = None): + def __init__( + self, + document_store: KeywordDocumentStore, + top_k: int = 10, + all_terms_must_match: bool = False, + custom_query: Optional[str] = None, + ): """ :param document_store: an instance of an ElasticsearchDocumentStore to retrieve documents from. + :param all_terms_must_match: Whether all terms of the query must match the document. + If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). + Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). + Defaults to False. :param custom_query: query string as per Elasticsearch DSL with a mandatory query placeholder(query). 
Optionally, ES `filter` clause can be added where the values of `terms` are placeholders @@ -91,6 +101,7 @@ class ElasticsearchRetriever(BaseRetriever): self.document_store: KeywordDocumentStore = document_store self.top_k = top_k self.custom_query = custom_query + self.all_terms_must_match = all_terms_must_match def retrieve( self, @@ -116,7 +127,15 @@ class ElasticsearchRetriever(BaseRetriever): if index is None: index = self.document_store.index - documents = self.document_store.query(query, filters, top_k, self.custom_query, index, headers=headers) + documents = self.document_store.query( + query=query, + filters=filters, + top_k=top_k, + all_terms_must_match=self.all_terms_must_match, + custom_query=self.custom_query, + index=index, + headers=headers, + ) return documents diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py index b3361caf8..30d414204 100644 --- a/haystack/utils/deepsetcloud.py +++ b/haystack/utils/deepsetcloud.py @@ -312,6 +312,7 @@ class IndexClient: similarity: Optional[str] = None, workspace: Optional[str] = None, index: Optional[str] = None, + all_terms_must_match: Optional[bool] = None, headers: dict = None, ) -> List[dict]: index_url = self._build_index_url(workspace=workspace, index=index) @@ -324,6 +325,7 @@ class IndexClient: "query_emb": query_emb, "similarity": similarity, "return_embedding": return_embedding, + "all_terms_must_match": all_terms_must_match, } response = self.client.post(url=query_url, json=request, headers=headers) return response.json() diff --git a/test/test_document_store.py b/test/test_document_store.py index facb42996..0b27c2e8d 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -1598,7 +1598,7 @@ def test_DeepsetCloudDocumentStore_query(deepset_cloud_document_store): responses.add( method=responses.POST, url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query", - match=[matchers.json_params_matcher({"query": "winterfell", "top_k": 
50})], + match=[matchers.json_params_matcher({"query": "winterfell", "top_k": 50, "all_terms_must_match": False})], status=200, body=query_winterfell_response, ) @@ -1612,6 +1612,7 @@ def test_DeepsetCloudDocumentStore_query(deepset_cloud_document_store): "query": "winterfell", "top_k": 50, "filters": {"file_id": [query_winterfell_docs[0]["meta"]["file_id"]]}, + "all_terms_must_match": False, } ) ], diff --git a/test/test_retriever.py b/test/test_retriever.py index 0e83cabc2..113dbe613 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -514,3 +514,38 @@ def test_elasticsearch_filter_must_not_increase_results(): results_w_filter = doc_store.query(query="drink", filters={"content_type": "text"}) assert len(results_w_filter) == 1 doc_store.delete_index(index) + + +def test_elasticsearch_all_terms_must_match(): + index = "all_terms_must_match" + client = Elasticsearch() + client.indices.delete(index=index, ignore=[404]) + documents = [ + { + "content": "The green tea plant contains a range of healthy compounds that make it into the final drink", + "meta": {"content_type": "text"}, + "id": "1", + }, + { + "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).", + "meta": {"content_type": "text"}, + "id": "2", + }, + { + "content": "Green tea also has small amounts of minerals that can benefit your health.", + "meta": {"content_type": "text"}, + "id": "3", + }, + { + "content": "Green tea does more than just keep you alert, it may also help boost brain function.", + "meta": {"content_type": "text"}, + "id": "4", + }, + ] + doc_store = ElasticsearchDocumentStore(index=index) + doc_store.write_documents(documents) + results_wo_all_terms_must_match = doc_store.query(query="drink green tea") + assert len(results_wo_all_terms_must_match) == 4 + results_w_all_terms_must_match = doc_store.query(query="drink green tea", all_terms_must_match=True) + assert len(results_w_all_terms_must_match) == 1 + doc_store.delete_index(index)