Allow setting of scroll param in ElasticsearchDocumentStore (#1645)

* remove scroll param in ES call

* Add scroll param to ES init

* Add latest docstring and tutorial changes

* Add scroll to set_config

* remove trailing comma

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
This commit is contained in:
Timo Moeller 2021-10-27 11:07:13 +02:00 committed by GitHub
parent e106eb41a7
commit 1d3f63ac2e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 13 additions and 5 deletions

View File

@ -150,7 +150,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
#### \_\_init\_\_
```python
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat")
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d")
```
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -202,6 +202,9 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
exists.
- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
- `scroll`: Determines how long the current index is fixed, e.g. while updating all documents with embeddings.
Defaults to "1d" and should not be larger than this. Can also be given in minutes (e.g. "5m") or hours (e.g. "15h").
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
#### get\_document\_by\_id

View File

@ -50,7 +50,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
timeout=30,
return_embedding: bool = False,
duplicate_documents: str = 'overwrite',
index_type: str = "flat"
index_type: str = "flat",
scroll: str = "1d"
):
"""
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -100,6 +101,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
exists.
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
:param scroll: Determines how long the current index is fixed, e.g. while updating all documents with embeddings.
Defaults to "1d" and should not be larger than this. Can also be given in minutes (e.g. "5m") or hours (e.g. "15h").
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
"""
# save init parameters to enable export of component config as YAML
@ -110,7 +114,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
custom_mapping=custom_mapping, excluded_meta_data=excluded_meta_data, analyzer=analyzer, scheme=scheme,
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
timeout=timeout, return_embedding=return_embedding, index_type=index_type
timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll
)
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
@ -135,6 +139,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.custom_mapping = custom_mapping
self.index: str = index
self.label_index: str = label_index
self.scroll = scroll
if similarity in ["cosine", "dot_product", "l2"]:
self.similarity = similarity
else:
@ -659,7 +664,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
if only_documents_without_embedding:
body['query']['bool']['must_not'] = [{"exists": {"field": self.embedding_field}}]
result = scan(self.client, query=body, index=index, size=batch_size, scroll="1d")
result = scan(self.client, query=body, index=index, size=batch_size, scroll=self.scroll)
yield from result
def query(
@ -1284,4 +1289,4 @@ class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore):
similarity=similarity,
**kwargs)
def _prepare_hosts(self, host, port):
return host
return host