diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index edf11fb58..e8a619075 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -609,8 +609,19 @@ def get_documents_by_id( headers: Optional[Dict[str, str]] = None) -> List[Document] ``` -Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead -to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default. +Fetch documents by specifying a list of text id strings. + +**Arguments**: + +- `ids`: List of document IDs. Be aware that passing a large number of ids might lead to performance issues. +- `index`: Elasticsearch index where the documents are stored. If not supplied, +self.index will be used. +- `batch_size`: Maximum number of results for each query. +By default, Elasticsearch limits the number of results to 10,000 documents. +To reduce the pressure on the Elasticsearch cluster, you can lower this limit, at the expense +of longer retrieval times. +- `headers`: Custom HTTP headers to pass to Elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index d82fc475f..d343507a8 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -336,13 +336,27 @@ class BaseElasticsearchDocumentStore(KeywordDocumentStore): headers: Optional[Dict[str, str]] = None, ) -> List[Document]: """ - Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead - to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default. + Fetch documents by specifying a list of text id strings. + + :param ids: List of document IDs. Be aware that passing a large number of ids might lead to performance issues. + :param index: Elasticsearch index where the documents are stored. If not supplied, + self.index will be used. + :param batch_size: Maximum number of results for each query. + By default, Elasticsearch limits the number of results to 10,000 documents. + To reduce the pressure on the Elasticsearch cluster, you can lower this limit, at the expense + of longer retrieval times. + :param headers: Custom HTTP headers to pass to Elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. """ index = index or self.index - query = {"size": len(ids), "query": {"ids": {"values": ids}}} - result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"] - documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result] + documents = [] + for i in range(0, len(ids), batch_size): + ids_for_batch = ids[i : i + batch_size] + query = {"size": len(ids_for_batch), "query": {"ids": {"values": ids_for_batch}}} + result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"] + documents.extend( + [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result] + ) return documents def get_metadata_values_by_key(