From a73717b2eab91abbc2e38cf2d5b39855fe5e19f8 Mon Sep 17 00:00:00 2001
From: tstadel <60758086+tstadel@users.noreply.github.com>
Date: Mon, 28 Mar 2022 22:10:50 +0200
Subject: [PATCH] Support conjunctive queries in sparse retrieval (#2361)
* support conjunctive queries in sparse retrieval
* fix typo
* test added
* Update Documentation & Code Style
* fix test_DeepsetCloudDocumentStore_query
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
docs/_src/api/api/document_store.md | 18 ++++++++--
docs/_src/api/api/retriever.md | 6 +++-
haystack/document_stores/base.py | 5 +++
haystack/document_stores/deepsetcloud.py | 13 ++++++-
haystack/document_stores/elasticsearch.py | 17 ++++++++-
.../haystack-pipeline-1.2.1rc0.schema.json | 10 ++++++
.../haystack-pipeline-unstable.schema.json | 10 ++++++
haystack/nodes/retriever/sparse.py | 23 ++++++++++--
haystack/utils/deepsetcloud.py | 2 ++
test/test_document_store.py | 3 +-
test/test_retriever.py | 35 +++++++++++++++++++
11 files changed, 133 insertions(+), 9 deletions(-)
diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 61b69bf7d..b369d6f32 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -305,7 +305,7 @@ Base class for implementing Document Stores that support keyword searches.
```python
@abstractmethod
-def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False) -> List[Document]
```
Scan through documents in DocumentStore and return a small number documents
@@ -382,6 +382,10 @@ operation.
- `custom_query`: Custom query to be executed.
- `index`: The name of the index in the DocumentStore from which to retrieve documents
- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
@@ -762,7 +766,7 @@ Return all labels in the document store
#### query
```python
-def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False) -> List[Document]
```
Scan through documents in DocumentStore and return a small number documents
@@ -903,6 +907,10 @@ You will find the highlighted output in the returned Document's meta field by ke
- `index`: The name of the index in the DocumentStore from which to retrieve documents
- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
@@ -4135,7 +4143,7 @@ operation.
#### query
```python
-def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
+def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False) -> List[Document]
```
Scan through documents in DocumentStore and return a small number documents
@@ -4212,6 +4220,10 @@ operation.
- `custom_query`: Custom query to be executed.
- `index`: The name of the index in the DocumentStore from which to retrieve documents
- `headers`: Custom HTTP headers to pass to requests
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md
index 90e7db44f..bb75023bb 100644
--- a/docs/_src/api/api/retriever.md
+++ b/docs/_src/api/api/retriever.md
@@ -107,12 +107,16 @@ class ElasticsearchRetriever(BaseRetriever)
#### \_\_init\_\_
```python
-def __init__(document_store: KeywordDocumentStore, top_k: int = 10, custom_query: Optional[str] = None)
+def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None)
```
**Arguments**:
- `document_store`: an instance of an ElasticsearchDocumentStore to retrieve documents from.
+- `all_terms_must_match`: Whether all terms of the query must match the document.
+If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+Defaults to False.
- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query).
Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py
index 6f44d024f..8c3737bb4 100644
--- a/haystack/document_stores/base.py
+++ b/haystack/document_stores/base.py
@@ -662,6 +662,7 @@ class KeywordDocumentStore(BaseDocumentStore):
custom_query: Optional[str] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
+ all_terms_must_match: bool = False,
) -> List[Document]:
"""
Scan through documents in DocumentStore and return a small number documents
@@ -736,6 +737,10 @@ class KeywordDocumentStore(BaseDocumentStore):
:param custom_query: Custom query to be executed.
:param index: The name of the index in the DocumentStore from which to retrieve documents
:param headers: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication)
+ :param all_terms_must_match: Whether all terms of the query must match the document.
+ If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+ Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+ Defaults to False.
"""
diff --git a/haystack/document_stores/deepsetcloud.py b/haystack/document_stores/deepsetcloud.py
index 5fdce4f28..e6ccdafe4 100644
--- a/haystack/document_stores/deepsetcloud.py
+++ b/haystack/document_stores/deepsetcloud.py
@@ -327,6 +327,7 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
custom_query: Optional[str] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
+ all_terms_must_match: bool = False,
) -> List[Document]:
"""
Scan through documents in DocumentStore and return a small number documents
@@ -400,9 +401,19 @@ class DeepsetCloudDocumentStore(KeywordDocumentStore):
:param custom_query: Custom query to be executed.
:param index: The name of the index in the DocumentStore from which to retrieve documents
:param headers: Custom HTTP headers to pass to requests
+ :param all_terms_must_match: Whether all terms of the query must match the document.
+ If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+ Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+ Defaults to False.
"""
doc_dicts = self.client.query(
- query=query, filters=filters, top_k=top_k, custom_query=custom_query, index=index, headers=headers
+ query=query,
+ filters=filters,
+ top_k=top_k,
+ custom_query=custom_query,
+ index=index,
+ all_terms_must_match=all_terms_must_match,
+ headers=headers,
)
docs = [Document.from_dict(doc) for doc in doc_dicts]
return docs
diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
index eb11c8918..f107f1846 100644
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -872,6 +872,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
custom_query: Optional[str] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
+ all_terms_must_match: bool = False,
) -> List[Document]:
"""
Scan through documents in DocumentStore and return a small number documents
@@ -1011,6 +1012,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
:param index: The name of the index in the DocumentStore from which to retrieve documents
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+ :param all_terms_must_match: Whether all terms of the query must match the document.
+ If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+ Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+ Defaults to False.
"""
if index is None:
@@ -1045,11 +1050,21 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
"The query provided seems to be not a string, but an object "
f"of type {type(query)}. This can cause Elasticsearch to fail."
)
+ operator = "AND" if all_terms_must_match else "OR"
body = {
"size": str(top_k),
"query": {
"bool": {
- "must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}]
+ "must": [
+ {
+ "multi_match": {
+ "query": query,
+ "type": "most_fields",
+ "fields": self.search_fields,
+ "operator": operator,
+ }
+ }
+ ]
}
},
}
diff --git a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json
index e5268a5a4..7a44ebf43 100644
--- a/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-1.2.1rc0.schema.json
@@ -1630,6 +1630,11 @@
"default": 10,
"type": "integer"
},
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
"custom_query": {
"title": "Custom Query",
"type": "string"
@@ -1675,6 +1680,11 @@
"default": 10,
"type": "integer"
},
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
"custom_query": {
"title": "Custom Query",
"type": "string"
diff --git a/haystack/json-schemas/haystack-pipeline-unstable.schema.json b/haystack/json-schemas/haystack-pipeline-unstable.schema.json
index 31af33b03..b85f3583e 100644
--- a/haystack/json-schemas/haystack-pipeline-unstable.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-unstable.schema.json
@@ -1633,6 +1633,11 @@
"default": 10,
"type": "integer"
},
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
"custom_query": {
"title": "Custom Query",
"type": "string"
@@ -1678,6 +1683,11 @@
"default": 10,
"type": "integer"
},
+ "all_terms_must_match": {
+ "title": "All Terms Must Match",
+ "default": false,
+ "type": "boolean"
+ },
"custom_query": {
"title": "Custom Query",
"type": "string"
diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py
index cba1ac613..ff61b9b53 100644
--- a/haystack/nodes/retriever/sparse.py
+++ b/haystack/nodes/retriever/sparse.py
@@ -16,9 +16,19 @@ logger = logging.getLogger(__name__)
class ElasticsearchRetriever(BaseRetriever):
- def __init__(self, document_store: KeywordDocumentStore, top_k: int = 10, custom_query: Optional[str] = None):
+ def __init__(
+ self,
+ document_store: KeywordDocumentStore,
+ top_k: int = 10,
+ all_terms_must_match: bool = False,
+ custom_query: Optional[str] = None,
+ ):
"""
:param document_store: an instance of an ElasticsearchDocumentStore to retrieve documents from.
+ :param all_terms_must_match: Whether all terms of the query must match the document.
+ If true, all query terms must be present in a document in order to be retrieved (i.e., the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant").
+ Otherwise, at least one query term must be present in a document in order to be retrieved (i.e., the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant").
+ Defaults to False.
:param custom_query: query string as per Elasticsearch DSL with a mandatory query placeholder(query).
Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
@@ -91,6 +101,7 @@ class ElasticsearchRetriever(BaseRetriever):
self.document_store: KeywordDocumentStore = document_store
self.top_k = top_k
self.custom_query = custom_query
+ self.all_terms_must_match = all_terms_must_match
def retrieve(
self,
@@ -116,7 +127,15 @@ class ElasticsearchRetriever(BaseRetriever):
if index is None:
index = self.document_store.index
- documents = self.document_store.query(query, filters, top_k, self.custom_query, index, headers=headers)
+ documents = self.document_store.query(
+ query=query,
+ filters=filters,
+ top_k=top_k,
+ all_terms_must_match=self.all_terms_must_match,
+ custom_query=self.custom_query,
+ index=index,
+ headers=headers,
+ )
return documents
diff --git a/haystack/utils/deepsetcloud.py b/haystack/utils/deepsetcloud.py
index b3361caf8..30d414204 100644
--- a/haystack/utils/deepsetcloud.py
+++ b/haystack/utils/deepsetcloud.py
@@ -312,6 +312,7 @@ class IndexClient:
similarity: Optional[str] = None,
workspace: Optional[str] = None,
index: Optional[str] = None,
+ all_terms_must_match: Optional[bool] = None,
headers: dict = None,
) -> List[dict]:
index_url = self._build_index_url(workspace=workspace, index=index)
@@ -324,6 +325,7 @@ class IndexClient:
"query_emb": query_emb,
"similarity": similarity,
"return_embedding": return_embedding,
+ "all_terms_must_match": all_terms_must_match,
}
response = self.client.post(url=query_url, json=request, headers=headers)
return response.json()
diff --git a/test/test_document_store.py b/test/test_document_store.py
index facb42996..0b27c2e8d 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -1598,7 +1598,7 @@ def test_DeepsetCloudDocumentStore_query(deepset_cloud_document_store):
responses.add(
method=responses.POST,
url=f"{DC_API_ENDPOINT}/workspaces/default/indexes/{DC_TEST_INDEX}/documents-query",
- match=[matchers.json_params_matcher({"query": "winterfell", "top_k": 50})],
+ match=[matchers.json_params_matcher({"query": "winterfell", "top_k": 50, "all_terms_must_match": False})],
status=200,
body=query_winterfell_response,
)
@@ -1612,6 +1612,7 @@ def test_DeepsetCloudDocumentStore_query(deepset_cloud_document_store):
"query": "winterfell",
"top_k": 50,
"filters": {"file_id": [query_winterfell_docs[0]["meta"]["file_id"]]},
+ "all_terms_must_match": False,
}
)
],
diff --git a/test/test_retriever.py b/test/test_retriever.py
index 0e83cabc2..113dbe613 100644
--- a/test/test_retriever.py
+++ b/test/test_retriever.py
@@ -514,3 +514,38 @@ def test_elasticsearch_filter_must_not_increase_results():
results_w_filter = doc_store.query(query="drink", filters={"content_type": "text"})
assert len(results_w_filter) == 1
doc_store.delete_index(index)
+
+
+def test_elasticsearch_all_terms_must_match():  # all_terms_must_match=True must narrow OR matching to AND matching
+ index = "all_terms_must_match"
+ client = Elasticsearch()
+ client.indices.delete(index=index, ignore=[404])  # start from a clean index; a 404 (index missing) is ignored
+ documents = [
+ {
+ "content": "The green tea plant contains a range of healthy compounds that make it into the final drink",
+ "meta": {"content_type": "text"},
+ "id": "1",
+ },
+ {
+ "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
+ "meta": {"content_type": "text"},
+ "id": "2",
+ },
+ {
+ "content": "Green tea also has small amounts of minerals that can benefit your health.",
+ "meta": {"content_type": "text"},
+ "id": "3",
+ },
+ {
+ "content": "Green tea does more than just keep you alert, it may also help boost brain function.",
+ "meta": {"content_type": "text"},
+ "id": "4",
+ },
+ ]
+ doc_store = ElasticsearchDocumentStore(index=index)
+ doc_store.write_documents(documents)
+ results_wo_all_terms_must_match = doc_store.query(query="drink green tea")
+ assert len(results_wo_all_terms_must_match) == 4  # default (OR): all four documents mention green tea
+ results_w_all_terms_must_match = doc_store.query(query="drink green tea", all_terms_must_match=True)
+ assert len(results_w_all_terms_must_match) == 1  # AND: only document "1" also contains "drink"
+ doc_store.delete_index(index)  # remove the index created for this test