bug: make ElasticSearchDocumentStore use batch_size in get_documents_by_id (#3166)

* use batch_size * try to fix git mess * improve docstrings * fix
2025-12-24 13:38:53 +00:00 · 2022-09-26 13:21:59 +02:00 · 2022-09-26 13:21:59 +02:00 · b579b9d54a
commit b579b9d54a
parent 9ca3ccae98
2 changed files with 32 additions and 7 deletions
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -609,8 +609,19 @@ def get_documents_by_id(
        headers: Optional[Dict[str, str]] = None) -> List[Document]
 ```

-Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
-to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
+Fetch documents by specifying a list of text id strings.
+
+**Arguments**:
+
+- `ids`: List of document IDs. Be aware that passing a large number of ids might lead to performance issues.
+- `index`: Elasticsearch index where the documents are stored. If not supplied,
+self.index will be used.
+- `batch_size`: Maximum number of results for each query.
+By default, Elasticsearch limits the number of results to 10,000 documents.
+To reduce the pressure on the Elasticsearch cluster, you can lower this limit, at the expense
+of longer retrieval times.
+- `headers`: Custom HTTP headers to pass to the Elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}).
+Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.

 <a id="elasticsearch.BaseElasticsearchDocumentStore.get_metadata_values_by_key"></a>

--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -336,13 +336,27 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore):
        headers: Optional[Dict[str, str]] = None,
    ) -> List[Document]:
        """
-        Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead
-        to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default.
+        Fetch documents by specifying a list of text id strings.
+
+        :param ids: List of document IDs. Be aware that passing a large number of ids might lead to performance issues.
+        :param index: Elasticsearch index where the documents are stored. If not supplied,
+                      self.index will be used.
+        :param batch_size: Maximum number of results for each query.
+                           By default, Elasticsearch limits the number of results to 10,000 documents.
+                           To reduce the pressure on the Elasticsearch cluster, you can lower this limit, at the expense
+                           of longer retrieval times.
+        :param headers: Custom HTTP headers to pass to the Elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}).
+                        Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
        """
        index = index or self.index
-        query = {"size": len(ids), "query": {"ids": {"values": ids}}}
-        result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"]
-        documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
+        documents = []
+        for i in range(0, len(ids), batch_size):
+            ids_for_batch = ids[i : i + batch_size]
+            query = {"size": len(ids_for_batch), "query": {"ids": {"values": ids_for_batch}}}
+            result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"]
+            documents.extend(
+                [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
+            )
        return documents

    def get_metadata_values_by_key(