From b20a1f874b24d76e6da510b8d721213e5cb6e7e3 Mon Sep 17 00:00:00 2001
From: tstadel <60758086+tstadel@users.noreply.github.com>
Date: Fri, 25 Mar 2022 17:53:42 +0100
Subject: [PATCH] Fix sparse retrieval with filters returns results without any
 text-match (#2359)

* use "must" instead of "should" for query-matching

* Update Documentation & Code Style

* fix mypy issue

* fix finding of new pylint version

* add test

* fix test_retrieval

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 haystack/document_stores/elasticsearch.py |  4 +--
 haystack/nodes/_json_schema.py            |  2 +-
 haystack/pipelines/utils.py               |  8 +++--
 test/test_retriever.py                    | 41 +++++++++++++++++++++--
 4 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
index bd4dc327f..eb11c8918 100644
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -1049,9 +1049,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
                 "size": str(top_k),
                 "query": {
                     "bool": {
-                        "should": [
-                            {"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}
-                        ]
+                        "must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}]
                     }
                 },
             }
diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py
index f02e5862f..4e878d402 100644
--- a/haystack/nodes/_json_schema.py
+++ b/haystack/nodes/_json_schema.py
@@ -360,7 +360,7 @@ def new_version_entry(version):
 
 
 def update_json_schema(
-    update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: Path = "haystack-pipeline.schema.json"
+    update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: str = "haystack-pipeline.schema.json"
 ):
     # Locate the latest schema's path
     latest_schema_path = destination_path / Path(
diff --git a/haystack/pipelines/utils.py b/haystack/pipelines/utils.py
index d89ae7515..21199c36a 100644
--- a/haystack/pipelines/utils.py
+++ b/haystack/pipelines/utils.py
@@ -182,13 +182,15 @@ def print_eval_report(
     }
 
     if metrics_filter is not None:
-        for metric_mode in calculated_metrics:
-            calculated_metrics[metric_mode] = {
+        calculated_metrics = {
+            metric_mode: {
                 node: metrics
                 if node not in metrics_filter
                 else {metric: value for metric, value in metrics.items() if metric in metrics_filter[node]}
-                for node, metrics in calculated_metrics[metric_mode].items()
+                for node, metrics in node_metrics_dict.items()
             }
+            for metric_mode, node_metrics_dict in calculated_metrics.items()
+        }
 
     pipeline_overview = _format_pipeline_overview(calculated_metrics=calculated_metrics, graph=graph)
     wrong_examples_report = _format_wrong_examples_report(eval_result=eval_result, n_wrong_examples=n_wrong_examples)
diff --git a/test/test_retriever.py b/test/test_retriever.py
index a581e87ec..0e83cabc2 100644
--- a/test/test_retriever.py
+++ b/test/test_retriever.py
@@ -82,7 +82,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
         retriever_with_docs, TfidfRetriever
     ):
         # single filter
-        result = retriever_with_docs.retrieve(query="godzilla", filters={"name": ["filename3"]}, top_k=5)
+        result = retriever_with_docs.retrieve(query="Christelle", filters={"name": ["filename3"]}, top_k=5)
         assert len(result) == 1
         assert type(result[0]) == Document
         assert result[0].content == "My name is Christelle and I live in Paris"
@@ -90,14 +90,14 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
 
         # multiple filters
         result = retriever_with_docs.retrieve(
-            query="godzilla", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
+            query="Paul", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
         )
         assert len(result) == 1
         assert type(result[0]) == Document
         assert result[0].meta["name"] == "filename2"
 
         result = retriever_with_docs.retrieve(
-            query="godzilla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
+            query="Carla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
         )
         assert len(result) == 0
 
@@ -479,3 +479,38 @@ def test_elasticsearch_highlight():
 
     assert len(results[0].meta["highlighted"]) == 1
     assert results[0].meta["highlighted"]["title"] == ["**Green**", "**tea** components"]
+
+
+def test_elasticsearch_filter_must_not_increase_results():
+    index = "filter_must_not_increase_results"
+    client = Elasticsearch()
+    client.indices.delete(index=index, ignore=[404])
+    documents = [
+        {
+            "content": "The green tea plant contains a range of healthy compounds that make it into the final drink",
+            "meta": {"content_type": "text"},
+            "id": "1",
+        },
+        {
+            "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
+            "meta": {"content_type": "text"},
+            "id": "2",
+        },
+        {
+            "content": "Green tea also has small amounts of minerals that can benefit your health.",
+            "meta": {"content_type": "text"},
+            "id": "3",
+        },
+        {
+            "content": "Green tea does more than just keep you alert, it may also help boost brain function.",
+            "meta": {"content_type": "text"},
+            "id": "4",
+        },
+    ]
+    doc_store = ElasticsearchDocumentStore(index=index)
+    doc_store.write_documents(documents)
+    results_wo_filter = doc_store.query(query="drink")
+    assert len(results_wo_filter) == 1
+    results_w_filter = doc_store.query(query="drink", filters={"content_type": "text"})
+    assert len(results_w_filter) == 1
+    doc_store.delete_index(index)