Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-11-03 11:19:57 +00:00)
Fix number of returned values in get_metadata_values_by_key (#2614)
* Apply pagination in get_metadata_values_by_key
* Update Documentation & Code Style
* Adapt test
* Fix test_eval.py by using pytest.approx

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent 6b78990a38
commit a617ab950b
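Background for the change below: an Elasticsearch terms aggregation returns only its default bucket count (10) per response, whereas a composite aggregation exposes an after_key cursor that can be fed back into the request to page through all buckets. A minimal sketch of that pattern with elasticsearch-py, assuming a hypothetical client es, index my_index, and metadata field my_key (none of these names come from the commit):

# Sketch only: composite-aggregation pagination with elasticsearch-py.
# `es`, "my_index", and "my_key" are assumed names; error handling omitted.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

body = {
    "size": 0,
    "aggs": {"metadata_agg": {"composite": {"sources": [{"my_key": {"terms": {"field": "my_key"}}}]}}},
}

values = []
while True:
    result = es.search(index="my_index", body=body)
    agg = result["aggregations"]["metadata_agg"]
    for bucket in agg["buckets"]:
        values.append({"value": bucket["key"]["my_key"], "count": bucket["doc_count"]})
    after_key = agg.get("after_key")
    if not after_key:  # no further pages
        break
    body["aggs"]["metadata_agg"]["composite"]["after"] = after_key  # resume after the last bucket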
@@ -502,7 +502,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
         :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                 Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
         """
-        body: dict = {"size": 0, "aggs": {"metadata_agg": {"terms": {"field": key}}}}
+        body: dict = {
+            "size": 0,
+            "aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
+        }
         if query:
             body["query"] = {
                 "bool": {
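For illustration, the request body produced by the new code for a hypothetical key "category" looks roughly like the sketch below; a composite aggregation nests each bucket key under bucket["key"][key], which is why the next hunk reads bucket["key"][key] rather than bucket["key"]:

# Roughly the aggregation body built above, for a hypothetical key "category";
# "size": 0 suppresses document hits so only aggregation buckets come back.
key = "category"
body = {
    "size": 0,
    "aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
}

# Expected response shape (values invented for illustration): the aggregation
# returns paged buckets plus an "after_key" cursor while more pages remain, e.g.
# {"aggregations": {"metadata_agg": {
#     "after_key": {"category": "sports"},
#     "buckets": [{"key": {"category": "news"}, "doc_count": 7}, ...]}}}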
@@ -514,11 +517,23 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
                 body["query"] = {"bool": {}}
             body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()})
         result = self.client.search(body=body, index=index, headers=headers)
-        buckets = result["aggregations"]["metadata_agg"]["buckets"]
-        for bucket in buckets:
-            bucket["count"] = bucket.pop("doc_count")
-            bucket["value"] = bucket.pop("key")
-        return buckets
+
+        values = []
+        current_buckets = result["aggregations"]["metadata_agg"]["buckets"]
+        after_key = result["aggregations"]["metadata_agg"].get("after_key", False)
+        for bucket in current_buckets:
+            values.append({"value": bucket["key"][key], "count": bucket["doc_count"]})
+
+        # Only 10 results get returned at a time, so apply pagination
+        while after_key:
+            body["aggs"]["metadata_agg"]["composite"]["after"] = after_key
+            result = self.client.search(body=body, index=index, headers=headers)
+            current_buckets = result["aggregations"]["metadata_agg"]["buckets"]
+            after_key = result["aggregations"]["metadata_agg"].get("after_key", False)
+            for bucket in current_buckets:
+                values.append({"value": bucket["key"][key], "count": bucket["doc_count"]})
+
+        return values
 
     def write_documents(
         self,
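A hedged usage sketch of the patched method; the host and index below are illustrative, not taken from the commit:

# Illustrative only: connection details and index name are assumptions.
from haystack.document_stores import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(host="localhost", index="document")

# With pagination applied, this returns one entry per distinct value of the
# metadata field, even when there are more than the 10 buckets Elasticsearch
# hands back per aggregation page.
for entry in document_store.get_metadata_values_by_key(key="meta_key_1"):
    print(entry["value"], entry["count"])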
@@ -1202,17 +1202,15 @@ def test_custom_embedding_field(document_store_type, tmp_path):
 
 @pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
 def test_get_meta_values_by_key(document_store: BaseDocumentStore):
-    documents = [
-        Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "11"}),
-        Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "22"}),
-        Document(content="Doc3", meta={"meta_key_1": "3", "meta_key_2": "33"}),
-    ]
+    documents = [Document(content=f"Doc{i}", meta={"meta_key_1": f"{i}", "meta_key_2": f"{i}{i}"}) for i in range(20)]
     document_store.write_documents(documents)
 
     # test without filters or query
     result = document_store.get_metadata_values_by_key(key="meta_key_1")
+    possible_values = [f"{i}" for i in range(20)]
+    assert len(result) == 20
     for bucket in result:
-        assert bucket["value"] in ["1", "2", "3"]
+        assert bucket["value"] in possible_values
         assert bucket["count"] == 1
 
     # test with filters but no query
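The commit message also mentions fixing test_eval.py by using pytest.approx; that hunk is not shown in this excerpt, but the pattern it refers to is the standard tolerant float comparison, sketched here with made-up numbers:

# Sketch of pytest.approx for float comparisons; the numbers are invented.
import pytest

def test_floats_compare_within_tolerance():
    computed = 0.1 + 0.2                     # 0.30000000000000004 due to float rounding
    assert computed == pytest.approx(0.3)    # passes despite the rounding error
    assert 2.50 == pytest.approx(2.52, rel=0.01)  # passes: within 1% relative tolerance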