mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 09:49:48 +00:00 
			
		
		
		
	Fix sparse retrieval with filters returns results without any text-match (#2359)
* use "must" instead of "should" for query-matching * Update Documentation & Code Style * fix mypy issue * fix finding of new pylint version * add test * fix test_retrieval Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									a398094243
								
							
						
					
					
						commit
						b20a1f874b
					
				| @ -1049,9 +1049,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): | ||||
|                 "size": str(top_k), | ||||
|                 "query": { | ||||
|                     "bool": { | ||||
|                         "should": [ | ||||
|                             {"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}} | ||||
|                         ] | ||||
|                         "must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}] | ||||
|                     } | ||||
|                 }, | ||||
|             } | ||||
|  | ||||
| @ -360,7 +360,7 @@ def new_version_entry(version): | ||||
| 
 | ||||
| 
 | ||||
| def update_json_schema( | ||||
|     update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: Path = "haystack-pipeline.schema.json" | ||||
|     update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: str = "haystack-pipeline.schema.json" | ||||
| ): | ||||
|     # Locate the latest schema's path | ||||
|     latest_schema_path = destination_path / Path( | ||||
|  | ||||
| @ -182,13 +182,15 @@ def print_eval_report( | ||||
|     } | ||||
| 
 | ||||
|     if metrics_filter is not None: | ||||
|         for metric_mode in calculated_metrics: | ||||
|             calculated_metrics[metric_mode] = { | ||||
|         calculated_metrics = { | ||||
|             metric_mode: { | ||||
|                 node: metrics | ||||
|                 if node not in metrics_filter | ||||
|                 else {metric: value for metric, value in metrics.items() if metric in metrics_filter[node]} | ||||
|                 for node, metrics in calculated_metrics[metric_mode].items() | ||||
|                 for node, metrics in node_metrics_dict.items() | ||||
|             } | ||||
|             for metric_mode, node_metrics_dict in calculated_metrics.items() | ||||
|         } | ||||
| 
 | ||||
|     pipeline_overview = _format_pipeline_overview(calculated_metrics=calculated_metrics, graph=graph) | ||||
|     wrong_examples_report = _format_wrong_examples_report(eval_result=eval_result, n_wrong_examples=n_wrong_examples) | ||||
|  | ||||
| @ -82,7 +82,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs): | ||||
|         retriever_with_docs, TfidfRetriever | ||||
|     ): | ||||
|         # single filter | ||||
|         result = retriever_with_docs.retrieve(query="godzilla", filters={"name": ["filename3"]}, top_k=5) | ||||
|         result = retriever_with_docs.retrieve(query="Christelle", filters={"name": ["filename3"]}, top_k=5) | ||||
|         assert len(result) == 1 | ||||
|         assert type(result[0]) == Document | ||||
|         assert result[0].content == "My name is Christelle and I live in Paris" | ||||
| @ -90,14 +90,14 @@ def test_retrieval(retriever_with_docs, document_store_with_docs): | ||||
| 
 | ||||
|         # multiple filters | ||||
|         result = retriever_with_docs.retrieve( | ||||
|             query="godzilla", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5 | ||||
|             query="Paul", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5 | ||||
|         ) | ||||
|         assert len(result) == 1 | ||||
|         assert type(result[0]) == Document | ||||
|         assert result[0].meta["name"] == "filename2" | ||||
| 
 | ||||
|         result = retriever_with_docs.retrieve( | ||||
|             query="godzilla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5 | ||||
|             query="Carla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5 | ||||
|         ) | ||||
|         assert len(result) == 0 | ||||
| 
 | ||||
| @ -479,3 +479,38 @@ def test_elasticsearch_highlight(): | ||||
| 
 | ||||
|     assert len(results[0].meta["highlighted"]) == 1 | ||||
|     assert results[0].meta["highlighted"]["title"] == ["**Green**", "**tea** components"] | ||||
| 
 | ||||
| 
 | ||||
| def test_elasticsearch_filter_must_not_increase_results(): | ||||
|     index = "filter_must_not_increase_results" | ||||
|     client = Elasticsearch() | ||||
|     client.indices.delete(index=index, ignore=[404]) | ||||
|     documents = [ | ||||
|         { | ||||
|             "content": "The green tea plant contains a range of healthy compounds that make it into the final drink", | ||||
|             "meta": {"content_type": "text"}, | ||||
|             "id": "1", | ||||
|         }, | ||||
|         { | ||||
|             "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).", | ||||
|             "meta": {"content_type": "text"}, | ||||
|             "id": "2", | ||||
|         }, | ||||
|         { | ||||
|             "content": "Green tea also has small amounts of minerals that can benefit your health.", | ||||
|             "meta": {"content_type": "text"}, | ||||
|             "id": "3", | ||||
|         }, | ||||
|         { | ||||
|             "content": "Green tea does more than just keep you alert, it may also help boost brain function.", | ||||
|             "meta": {"content_type": "text"}, | ||||
|             "id": "4", | ||||
|         }, | ||||
|     ] | ||||
|     doc_store = ElasticsearchDocumentStore(index=index) | ||||
|     doc_store.write_documents(documents) | ||||
|     results_wo_filter = doc_store.query(query="drink") | ||||
|     assert len(results_wo_filter) == 1 | ||||
|     results_w_filter = doc_store.query(query="drink", filters={"content_type": "text"}) | ||||
|     assert len(results_w_filter) == 1 | ||||
|     doc_store.delete_index(index) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 tstadel
						tstadel