mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-10 06:43:58 +00:00
Fix number of returned values in get_metadata_values_by_key (#2614)
* Apply pagination in get_metadata_values_by_key
* Update Documentation & Code Style
* Adapt test
* Fix test_eval.py by using pytest.approx

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
6b78990a38
commit
a617ab950b
@ -502,7 +502,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
|||||||
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
|
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
|
||||||
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
|
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
|
||||||
"""
|
"""
|
||||||
body: dict = {"size": 0, "aggs": {"metadata_agg": {"terms": {"field": key}}}}
|
body: dict = {
|
||||||
|
"size": 0,
|
||||||
|
"aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
|
||||||
|
}
|
||||||
if query:
|
if query:
|
||||||
body["query"] = {
|
body["query"] = {
|
||||||
"bool": {
|
"bool": {
|
||||||
@ -514,11 +517,23 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
|||||||
body["query"] = {"bool": {}}
|
body["query"] = {"bool": {}}
|
||||||
body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()})
|
body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()})
|
||||||
result = self.client.search(body=body, index=index, headers=headers)
|
result = self.client.search(body=body, index=index, headers=headers)
|
||||||
buckets = result["aggregations"]["metadata_agg"]["buckets"]
|
|
||||||
for bucket in buckets:
|
values = []
|
||||||
bucket["count"] = bucket.pop("doc_count")
|
current_buckets = result["aggregations"]["metadata_agg"]["buckets"]
|
||||||
bucket["value"] = bucket.pop("key")
|
after_key = result["aggregations"]["metadata_agg"].get("after_key", False)
|
||||||
return buckets
|
for bucket in current_buckets:
|
||||||
|
values.append({"value": bucket["key"][key], "count": bucket["doc_count"]})
|
||||||
|
|
||||||
|
# Only 10 results get returned at a time, so apply pagination
|
||||||
|
while after_key:
|
||||||
|
body["aggs"]["metadata_agg"]["composite"]["after"] = after_key
|
||||||
|
result = self.client.search(body=body, index=index, headers=headers)
|
||||||
|
current_buckets = result["aggregations"]["metadata_agg"]["buckets"]
|
||||||
|
after_key = result["aggregations"]["metadata_agg"].get("after_key", False)
|
||||||
|
for bucket in current_buckets:
|
||||||
|
values.append({"value": bucket["key"][key], "count": bucket["doc_count"]})
|
||||||
|
|
||||||
|
return values
|
||||||
|
|
||||||
def write_documents(
|
def write_documents(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -1202,17 +1202,15 @@ def test_custom_embedding_field(document_store_type, tmp_path):
|
|||||||
|
|
||||||
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
|
||||||
def test_get_meta_values_by_key(document_store: BaseDocumentStore):
|
def test_get_meta_values_by_key(document_store: BaseDocumentStore):
|
||||||
documents = [
|
documents = [Document(content=f"Doc{i}", meta={"meta_key_1": f"{i}", "meta_key_2": f"{i}{i}"}) for i in range(20)]
|
||||||
Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "11"}),
|
|
||||||
Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "22"}),
|
|
||||||
Document(content="Doc3", meta={"meta_key_1": "3", "meta_key_2": "33"}),
|
|
||||||
]
|
|
||||||
document_store.write_documents(documents)
|
document_store.write_documents(documents)
|
||||||
|
|
||||||
# test without filters or query
|
# test without filters or query
|
||||||
result = document_store.get_metadata_values_by_key(key="meta_key_1")
|
result = document_store.get_metadata_values_by_key(key="meta_key_1")
|
||||||
|
possible_values = [f"{i}" for i in range(20)]
|
||||||
|
assert len(result) == 20
|
||||||
for bucket in result:
|
for bucket in result:
|
||||||
assert bucket["value"] in ["1", "2", "3"]
|
assert bucket["value"] in possible_values
|
||||||
assert bucket["count"] == 1
|
assert bucket["count"] == 1
|
||||||
|
|
||||||
# test with filters but no query
|
# test with filters but no query
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user