Fix sparse retrieval with filters returning results without any text match (#2359)

* use "must" instead of "should" for query-matching

* Update Documentation & Code Style

* fix mypy issue

* fix finding reported by new pylint version

* add test

* fix test_retrieval

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
tstadel 2022-03-25 17:53:42 +01:00 committed by GitHub
parent a398094243
commit b20a1f874b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 45 additions and 10 deletions

View File

@ -1049,9 +1049,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
"size": str(top_k),
"query": {
"bool": {
"should": [
{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}
]
"must": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}]
}
},
}

View File

@ -360,7 +360,7 @@ def new_version_entry(version):
def update_json_schema(
update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: Path = "haystack-pipeline.schema.json"
update_index: bool, destination_path: Path = JSON_SCHEMAS_PATH, index_name: str = "haystack-pipeline.schema.json"
):
# Locate the latest schema's path
latest_schema_path = destination_path / Path(

View File

@ -182,13 +182,15 @@ def print_eval_report(
}
if metrics_filter is not None:
for metric_mode in calculated_metrics:
calculated_metrics[metric_mode] = {
calculated_metrics = {
metric_mode: {
node: metrics
if node not in metrics_filter
else {metric: value for metric, value in metrics.items() if metric in metrics_filter[node]}
for node, metrics in calculated_metrics[metric_mode].items()
for node, metrics in node_metrics_dict.items()
}
for metric_mode, node_metrics_dict in calculated_metrics.items()
}
pipeline_overview = _format_pipeline_overview(calculated_metrics=calculated_metrics, graph=graph)
wrong_examples_report = _format_wrong_examples_report(eval_result=eval_result, n_wrong_examples=n_wrong_examples)

View File

@ -82,7 +82,7 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
retriever_with_docs, TfidfRetriever
):
# single filter
result = retriever_with_docs.retrieve(query="godzilla", filters={"name": ["filename3"]}, top_k=5)
result = retriever_with_docs.retrieve(query="Christelle", filters={"name": ["filename3"]}, top_k=5)
assert len(result) == 1
assert type(result[0]) == Document
assert result[0].content == "My name is Christelle and I live in Paris"
@ -90,14 +90,14 @@ def test_retrieval(retriever_with_docs, document_store_with_docs):
# multiple filters
result = retriever_with_docs.retrieve(
query="godzilla", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
query="Paul", filters={"name": ["filename2"], "meta_field": ["test2", "test3"]}, top_k=5
)
assert len(result) == 1
assert type(result[0]) == Document
assert result[0].meta["name"] == "filename2"
result = retriever_with_docs.retrieve(
query="godzilla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
query="Carla", filters={"name": ["filename1"], "meta_field": ["test2", "test3"]}, top_k=5
)
assert len(result) == 0
@ -479,3 +479,38 @@ def test_elasticsearch_highlight():
assert len(results[0].meta["highlighted"]) == 1
assert results[0].meta["highlighted"]["title"] == ["**Green**", "**tea** components"]
def test_elasticsearch_filter_must_not_increase_results():
    """Check that adding a filter to a keyword query does not add results.

    Four documents are indexed, all carrying the same ``content_type`` meta
    value, but only the first one contains the word "drink". The query must
    return exactly one document both without a filter and with a filter that
    matches every indexed document.
    """
    index_name = "filter_must_not_increase_results"
    texts = [
        "The green tea plant contains a range of healthy compounds that make it into the final drink",
        "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
        "Green tea also has small amounts of minerals that can benefit your health.",
        "Green tea does more than just keep you alert, it may also help boost brain function.",
    ]

    # Drop any index left over from an earlier run; a 404 response is ignored.
    es_client = Elasticsearch()
    es_client.indices.delete(index=index_name, ignore=[404])

    # Ids are the 1-based positions of the texts, stored as strings.
    docs = [
        {"content": text, "meta": {"content_type": "text"}, "id": str(position)}
        for position, text in enumerate(texts, start=1)
    ]
    store = ElasticsearchDocumentStore(index=index_name)
    store.write_documents(docs)

    unfiltered_hits = store.query(query="drink")
    assert len(unfiltered_hits) == 1

    # The filter matches all four documents, so any extra hit here would be
    # a document returned without a text match.
    filtered_hits = store.query(query="drink", filters={"content_type": "text"})
    assert len(filtered_hits) == 1

    store.delete_index(index_name)