Fix sparse retrieval with filters returning results without any text match (#2359)

* use "must" instead of "should" for query-matching
* Update Documentation & Code Style
* fix mypy issue
* fix finding of new pylint version
* add test
* fix test_retrieval

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
2025-11-02 18:59:28 +00:00 · 2022-03-25 17:53:42 +01:00 · 2022-03-25 17:53:42 +01:00 · b20a1f874b
commit b20a1f874b
parent a398094243
4 changed files with 45 additions and 10 deletions
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@ -1049,9 +1049,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
                "size": str(top_k),
                "query": {
                    "bool": {
-                        "should": [
-                            {"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}
-                        ]
+                        "must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}]
                    }
                },
            }
--- a/haystack/nodes/_json_schema.py
+++ b/haystack/nodes/_json_schema.py
@ -360,7 +360,7 @@ def new_version_entry(version):


 def update_json_schema(
-    update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: Path = "haystack-pipeline.schema.json"
+    update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: str = "haystack-pipeline.schema.json"
 ):
    # Locate the latest schema's path
    latest_schema_path = destination_path / Path(
--- a/haystack/pipelines/utils.py
+++ b/haystack/pipelines/utils.py
@ -182,13 +182,15 @@ def print_eval_report(
    }

    if metrics_filter is not None:
-        for metric_mode in calculated_metrics:
-            calculated_metrics[metric_mode] = {
+        calculated_metrics = {
+            metric_mode: {
                node: metrics
                if node not in metrics_filter
                else {metric: value for metric, value in metrics.items() if metric in metrics_filter[node]}
-                for node, metrics in calculated_metrics[metric_mode].items()
+                for node, metrics in node_metrics_dict.items()
            }
+            for metric_mode, node_metrics_dict in calculated_metrics.items()
+        }

    pipeline_overview = _format_pipeline_overview(calculated_metrics=calculated_metrics, graph=graph)
    wrong_examples_report = _format_wrong_examples_report(eval_result=eval_result, n_wrong_examples=n_wrong_examples)
--- a/test/test_retriever.py
+++ b/test/test_retriever.py
@ -82,7 +82,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
        retriever_with_docs, TfidfRetriever
    ):
        # single filter
-        result = retriever_with_docs.retrieve(query="godzilla", filters={"name": ["filename3"]}, top_k=5)
+        result = retriever_with_docs.retrieve(query="Christelle", filters={"name": ["filename3"]}, top_k=5)
        assert len(result) == 1
        assert type(result[0]) == Document
        assert result[0].content == "My name is Christelle and I live in Paris"
@ -90,14 +90,14 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):

        # multiple filters
        result = retriever_with_docs.retrieve(
-            query="godzilla", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
+            query="Paul", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
        )
        assert len(result) == 1
        assert type(result[0]) == Document
        assert result[0].meta["name"] == "filename2"

        result = retriever_with_docs.retrieve(
-            query="godzilla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
+            query="Carla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
        )
        assert len(result) == 0

@ -479,3 +479,38 @@ def test_elasticsearch_highlight():

    assert len(results[0].meta["highlighted"]) == 1
    assert results[0].meta["highlighted"]["title"] == ["**Green**", "**tea** components"]
+
+
+def test_elasticsearch_filter_must_not_increase_results():
+    index = "filter_must_not_increase_results"
+    client = Elasticsearch()
+    client.indices.delete(index=index, ignore=[404])
+    documents = [
+        {
+            "content": "The green tea plant contains a range of healthy compounds that make it into the final drink",
+            "meta": {"content_type": "text"},
+            "id": "1",
+        },
+        {
+            "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
+            "meta": {"content_type": "text"},
+            "id": "2",
+        },
+        {
+            "content": "Green tea also has small amounts of minerals that can benefit your health.",
+            "meta": {"content_type": "text"},
+            "id": "3",
+        },
+        {
+            "content": "Green tea does more than just keep you alert, it may also help boost brain function.",
+            "meta": {"content_type": "text"},
+            "id": "4",
+        },
+    ]
+    doc_store = ElasticsearchDocumentStore(index=index)
+    doc_store.write_documents(documents)
+    results_wo_filter = doc_store.query(query="drink")
+    assert len(results_wo_filter) == 1
+    results_w_filter = doc_store.query(query="drink", filters={"content_type": "text"})
+    assert len(results_w_filter) == 1
+    doc_store.delete_index(index)