Fix number of returned values in get_metadata_values_by_key (#2614)

* Apply pagination in get_metadata_values_by_key

* Update Documentation & Code Style

* Adapt test

* Fix test_eval.py by using pytest.approx

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
bogdankostic 2022-06-01 10:21:28 +02:00 committed by GitHub
parent 6b78990a38
commit a617ab950b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 12 deletions

View File

@ -502,7 +502,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
:param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
"""
body: dict = {"size": 0, "aggs": {"metadata_agg": {"terms": {"field": key}}}}
body: dict = {
"size": 0,
"aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
}
if query:
body["query"] = {
"bool": {
@ -514,11 +517,23 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
body["query"] = {"bool": {}}
body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()})
result = self.client.search(body=body, index=index, headers=headers)
buckets = result["aggregations"]["metadata_agg"]["buckets"]
for bucket in buckets:
bucket["count"] = bucket.pop("doc_count")
bucket["value"] = bucket.pop("key")
return buckets
values = []
current_buckets = result["aggregations"]["metadata_agg"]["buckets"]
after_key = result["aggregations"]["metadata_agg"].get("after_key", False)
for bucket in current_buckets:
values.append({"value": bucket["key"][key], "count": bucket["doc_count"]})
# Only 10 results get returned at a time, so apply pagination
while after_key:
body["aggs"]["metadata_agg"]["composite"]["after"] = after_key
result = self.client.search(body=body, index=index, headers=headers)
current_buckets = result["aggregations"]["metadata_agg"]["buckets"]
after_key = result["aggregations"]["metadata_agg"].get("after_key", False)
for bucket in current_buckets:
values.append({"value": bucket["key"][key], "count": bucket["doc_count"]})
return values
def write_documents(
self,

View File

@ -1202,17 +1202,15 @@ def test_custom_embedding_field(document_store_type, tmp_path):
@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
def test_get_meta_values_by_key(document_store: BaseDocumentStore):
documents = [
Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "11"}),
Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "22"}),
Document(content="Doc3", meta={"meta_key_1": "3", "meta_key_2": "33"}),
]
documents = [Document(content=f"Doc{i}", meta={"meta_key_1": f"{i}", "meta_key_2": f"{i}{i}"}) for i in range(20)]
document_store.write_documents(documents)
# test without filters or query
result = document_store.get_metadata_values_by_key(key="meta_key_1")
possible_values = [f"{i}" for i in range(20)]
assert len(result) == 20
for bucket in result:
assert bucket["value"] in ["1", "2", "3"]
assert bucket["value"] in possible_values
assert bucket["count"] == 1
# test with filters but no query