From b20a1f874b24d76e6da510b8d721213e5cb6e7e3 Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Fri, 25 Mar 2022 17:53:42 +0100 Subject: [PATCH] Fix sparse retrieval with filters returns results without any text-match (#2359) * use "must" instead of "should" for query-matching * Update Documentation & Code Style * fix mypy issue * fix finding of new pylint version * add test * fix test_retrieval Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- haystack/document_stores/elasticsearch.py | 4 +-- haystack/nodes/_json_schema.py | 2 +- haystack/pipelines/utils.py | 8 +++-- test/test_retriever.py | 41 +++++++++++++++++++++-- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index bd4dc327f..eb11c8918 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -1049,9 +1049,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): "size": str(top_k), "query": { "bool": { - "should": [ - {"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}} - ] + "must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}] } }, } diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py index f02e5862f..4e878d402 100644 --- a/haystack/nodes/_json_schema.py +++ b/haystack/nodes/_json_schema.py @@ -360,7 +360,7 @@ def new_version_entry(version): def update_json_schema( - update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: Path = "haystack-pipeline.schema.json" + update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: str = "haystack-pipeline.schema.json" ): # Locate the latest schema's path latest_schema_path = destination_path / Path( diff --git a/haystack/pipelines/utils.py b/haystack/pipelines/utils.py index d89ae7515..21199c36a 100644 --- a/haystack/pipelines/utils.py +++ b/haystack/pipelines/utils.py @@ -182,13 +182,15 @@ def print_eval_report( } if metrics_filter is not None: - for metric_mode in calculated_metrics: - calculated_metrics[metric_mode] = { + calculated_metrics = { + metric_mode: { node: metrics if node not in metrics_filter else {metric: value for metric, value in metrics.items() if metric in metrics_filter[node]} - for node, metrics in calculated_metrics[metric_mode].items() + for node, metrics in node_metrics_dict.items() } + for metric_mode, node_metrics_dict in calculated_metrics.items() + } pipeline_overview = _format_pipeline_overview(calculated_metrics=calculated_metrics, graph=graph) wrong_examples_report = _format_wrong_examples_report(eval_result=eval_result, n_wrong_examples=n_wrong_examples) diff --git a/test/test_retriever.py b/test/test_retriever.py index a581e87ec..0e83cabc2 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -82,7 +82,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs): retriever_with_docs, TfidfRetriever ): # single filter - result = retriever_with_docs.retrieve(query="godzilla", filters={"name": ["filename3"]}, top_k=5) + result = retriever_with_docs.retrieve(query="Christelle", filters={"name": ["filename3"]}, top_k=5) assert len(result) == 1 assert type(result[0]) == Document assert result[0].content == "My name is Christelle and I live in Paris" @@ -90,14 +90,14 @@ def test_retrieval(retriever_with_docs, document_store_with_docs): # multiple filters result = retriever_with_docs.retrieve( - query="godzilla", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5 + query="Paul", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5 ) assert len(result) == 1 assert type(result[0]) == Document assert result[0].meta["name"] == "filename2" result = retriever_with_docs.retrieve( - query="godzilla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5 + query="Carla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5 ) assert len(result) == 0 @@ -479,3 +479,38 @@ def test_elasticsearch_highlight(): assert len(results[0].meta["highlighted"]) == 1 assert results[0].meta["highlighted"]["title"] == ["**Green**", "**tea** components"] + + +def test_elasticsearch_filter_must_not_increase_results(): + index = "filter_must_not_increase_results" + client = Elasticsearch() + client.indices.delete(index=index, ignore=[404]) + documents = [ + { + "content": "The green tea plant contains a range of healthy compounds that make it into the final drink", + "meta": {"content_type": "text"}, + "id": "1", + }, + { + "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).", + "meta": {"content_type": "text"}, + "id": "2", + }, + { + "content": "Green tea also has small amounts of minerals that can benefit your health.", + "meta": {"content_type": "text"}, + "id": "3", + }, + { + "content": "Green tea does more than just keep you alert, it may also help boost brain function.", + "meta": {"content_type": "text"}, + "id": "4", + }, + ] + doc_store = ElasticsearchDocumentStore(index=index) + doc_store.write_documents(documents) + results_wo_filter = doc_store.query(query="drink") + assert len(results_wo_filter) == 1 + results_w_filter = doc_store.query(query="drink", filters={"content_type": "text"}) + assert len(results_w_filter) == 1 + doc_store.delete_index(index)