mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-29 16:59:47 +00:00
Fix sparse retrieval with filters returns results without any text-match (#2359)
* use "must" instead of "should" for query-matching * Update Documentation & Code Style * fix mypy issue * fix finding of new pylint version * add test * fix test_retrieval Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
a398094243
commit
b20a1f874b
@ -1049,9 +1049,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
"size": str(top_k),
|
||||
"query": {
|
||||
"bool": {
|
||||
"should": [
|
||||
{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}
|
||||
]
|
||||
"must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}]
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
@ -360,7 +360,7 @@ def new_version_entry(version):
|
||||
|
||||
|
||||
def update_json_schema(
|
||||
update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: Path = "haystack-pipeline.schema.json"
|
||||
update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: str = "haystack-pipeline.schema.json"
|
||||
):
|
||||
# Locate the latest schema's path
|
||||
latest_schema_path = destination_path / Path(
|
||||
|
||||
@ -182,13 +182,15 @@ def print_eval_report(
|
||||
}
|
||||
|
||||
if metrics_filter is not None:
|
||||
for metric_mode in calculated_metrics:
|
||||
calculated_metrics[metric_mode] = {
|
||||
calculated_metrics = {
|
||||
metric_mode: {
|
||||
node: metrics
|
||||
if node not in metrics_filter
|
||||
else {metric: value for metric, value in metrics.items() if metric in metrics_filter[node]}
|
||||
for node, metrics in calculated_metrics[metric_mode].items()
|
||||
for node, metrics in node_metrics_dict.items()
|
||||
}
|
||||
for metric_mode, node_metrics_dict in calculated_metrics.items()
|
||||
}
|
||||
|
||||
pipeline_overview = _format_pipeline_overview(calculated_metrics=calculated_metrics, graph=graph)
|
||||
wrong_examples_report = _format_wrong_examples_report(eval_result=eval_result, n_wrong_examples=n_wrong_examples)
|
||||
|
||||
@ -82,7 +82,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
|
||||
retriever_with_docs, TfidfRetriever
|
||||
):
|
||||
# single filter
|
||||
result = retriever_with_docs.retrieve(query="godzilla", filters={"name": ["filename3"]}, top_k=5)
|
||||
result = retriever_with_docs.retrieve(query="Christelle", filters={"name": ["filename3"]}, top_k=5)
|
||||
assert len(result) == 1
|
||||
assert type(result[0]) == Document
|
||||
assert result[0].content == "My name is Christelle and I live in Paris"
|
||||
@ -90,14 +90,14 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
|
||||
|
||||
# multiple filters
|
||||
result = retriever_with_docs.retrieve(
|
||||
query="godzilla", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
|
||||
query="Paul", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
|
||||
)
|
||||
assert len(result) == 1
|
||||
assert type(result[0]) == Document
|
||||
assert result[0].meta["name"] == "filename2"
|
||||
|
||||
result = retriever_with_docs.retrieve(
|
||||
query="godzilla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
|
||||
query="Carla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
|
||||
)
|
||||
assert len(result) == 0
|
||||
|
||||
@ -479,3 +479,38 @@ def test_elasticsearch_highlight():
|
||||
|
||||
assert len(results[0].meta["highlighted"]) == 1
|
||||
assert results[0].meta["highlighted"]["title"] == ["**Green**", "**tea** components"]
|
||||
|
||||
|
||||
def test_elasticsearch_filter_must_not_increase_results():
|
||||
index = "filter_must_not_increase_results"
|
||||
client = Elasticsearch()
|
||||
client.indices.delete(index=index, ignore=[404])
|
||||
documents = [
|
||||
{
|
||||
"content": "The green tea plant contains a range of healthy compounds that make it into the final drink",
|
||||
"meta": {"content_type": "text"},
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
|
||||
"meta": {"content_type": "text"},
|
||||
"id": "2",
|
||||
},
|
||||
{
|
||||
"content": "Green tea also has small amounts of minerals that can benefit your health.",
|
||||
"meta": {"content_type": "text"},
|
||||
"id": "3",
|
||||
},
|
||||
{
|
||||
"content": "Green tea does more than just keep you alert, it may also help boost brain function.",
|
||||
"meta": {"content_type": "text"},
|
||||
"id": "4",
|
||||
},
|
||||
]
|
||||
doc_store = ElasticsearchDocumentStore(index=index)
|
||||
doc_store.write_documents(documents)
|
||||
results_wo_filter = doc_store.query(query="drink")
|
||||
assert len(results_wo_filter) == 1
|
||||
results_w_filter = doc_store.query(query="drink", filters={"content_type": "text"})
|
||||
assert len(results_w_filter) == 1
|
||||
doc_store.delete_index(index)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user