mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-05 03:17:31 +00:00
Allow setting of scroll param in ElasticsearchDocumentStore (#1645)
* Remove scroll param in ES call
* Add scroll param to ES init
* Add latest docstring and tutorial changes
* Add scroll to set_config
* Remove trailing comma

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
This commit is contained in:
parent
e106eb41a7
commit
1d3f63ac2e
@ -150,7 +150,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat")
|
||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d")
|
||||
```
|
||||
|
||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||
@ -202,6 +202,9 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
|
||||
exists.
|
||||
- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
||||
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
||||
- `scroll`: Determines how long the current index is fixed, e.g. while updating all documents with embeddings.
|
||||
Defaults to "1d" and should not be larger than this. Can also be given in minutes (e.g. "5m") or hours (e.g. "15h").
|
||||
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
|
||||
#### get\_document\_by\_id
|
||||
|
||||
@ -50,7 +50,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
timeout=30,
|
||||
return_embedding: bool = False,
|
||||
duplicate_documents: str = 'overwrite',
|
||||
index_type: str = "flat"
|
||||
index_type: str = "flat",
|
||||
scroll: str = "1d"
|
||||
):
|
||||
"""
|
||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||
@ -100,6 +101,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
exists.
|
||||
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
||||
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
||||
:param scroll: Determines how long the current index is fixed, e.g. while updating all documents with embeddings.
|
||||
Defaults to "1d" and should not be larger than this. Can also be given in minutes (e.g. "5m") or hours (e.g. "15h").
|
||||
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
||||
|
||||
"""
|
||||
# save init parameters to enable export of component config as YAML
|
||||
@ -110,7 +114,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
custom_mapping=custom_mapping, excluded_meta_data=excluded_meta_data, analyzer=analyzer, scheme=scheme,
|
||||
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
|
||||
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
|
||||
timeout=timeout, return_embedding=return_embedding, index_type=index_type
|
||||
timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll
|
||||
)
|
||||
|
||||
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
|
||||
@ -135,6 +139,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
self.custom_mapping = custom_mapping
|
||||
self.index: str = index
|
||||
self.label_index: str = label_index
|
||||
self.scroll = scroll
|
||||
if similarity in ["cosine", "dot_product", "l2"]:
|
||||
self.similarity = similarity
|
||||
else:
|
||||
@ -659,7 +664,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
||||
if only_documents_without_embedding:
|
||||
body['query']['bool']['must_not'] = [{"exists": {"field": self.embedding_field}}]
|
||||
|
||||
result = scan(self.client, query=body, index=index, size=batch_size, scroll="1d")
|
||||
result = scan(self.client, query=body, index=index, size=batch_size, scroll=self.scroll)
|
||||
yield from result
|
||||
|
||||
def query(
|
||||
@ -1284,4 +1289,4 @@ class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore):
|
||||
similarity=similarity,
|
||||
**kwargs)
|
||||
def _prepare_hosts(self, host, port):
|
||||
return host
|
||||
return host
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user