diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 271abc101..faf712d35 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -502,7 +502,10 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. """ - body: dict = {"size": 0, "aggs": {"metadata_agg": {"terms": {"field": key}}}} + body: dict = { + "size": 0, + "aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}}, + } if query: body["query"] = { "bool": { @@ -514,11 +517,23 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): body["query"] = {"bool": {}} body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()}) result = self.client.search(body=body, index=index, headers=headers) - buckets = result["aggregations"]["metadata_agg"]["buckets"] - for bucket in buckets: - bucket["count"] = bucket.pop("doc_count") - bucket["value"] = bucket.pop("key") - return buckets + + values = [] + current_buckets = result["aggregations"]["metadata_agg"]["buckets"] + after_key = result["aggregations"]["metadata_agg"].get("after_key", False) + for bucket in current_buckets: + values.append({"value": bucket["key"][key], "count": bucket["doc_count"]}) + + # Only 10 results get returned at a time, so apply pagination + while after_key: + body["aggs"]["metadata_agg"]["composite"]["after"] = after_key + result = self.client.search(body=body, index=index, headers=headers) + current_buckets = result["aggregations"]["metadata_agg"]["buckets"] + after_key = result["aggregations"]["metadata_agg"].get("after_key", False) + for bucket in current_buckets: + values.append({"value": bucket["key"][key], "count": bucket["doc_count"]}) + + return values def write_documents( self, diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index 59b7f9e8b..a892c7916 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -1202,17 +1202,15 @@ def test_custom_embedding_field(document_store_type, tmp_path): @pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) def test_get_meta_values_by_key(document_store: BaseDocumentStore): - documents = [ - Document(content="Doc1", meta={"meta_key_1": "1", "meta_key_2": "11"}), - Document(content="Doc2", meta={"meta_key_1": "2", "meta_key_2": "22"}), - Document(content="Doc3", meta={"meta_key_1": "3", "meta_key_2": "33"}), - ] + documents = [Document(content=f"Doc{i}", meta={"meta_key_1": f"{i}", "meta_key_2": f"{i}{i}"}) for i in range(20)] document_store.write_documents(documents) # test without filters or query result = document_store.get_metadata_values_by_key(key="meta_key_1") + possible_values = [f"{i}" for i in range(20)] + assert len(result) == 20 for bucket in result: - assert bucket["value"] in ["1", "2", "3"] + assert bucket["value"] in possible_values assert bucket["count"] == 1 # test with filters but no query