mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-05 19:36:55 +00:00
Allow setting of scroll param in ElasticsearchDocumentStore (#1645)
* remove scroll param in ES call
* Add scroll param to ES init
* Add latest docstring and tutorial changes
* Add scroll to set_config
* remove trailing comma

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
This commit is contained in:
parent
e106eb41a7
commit
1d3f63ac2e
@ -150,7 +150,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
|
|||||||
#### \_\_init\_\_
|
#### \_\_init\_\_
|
||||||
|
|
||||||
```python
|
```python
|
||||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat")
|
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d")
|
||||||
```
|
```
|
||||||
|
|
||||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||||
@ -202,6 +202,9 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
|
|||||||
exists.
|
exists.
|
||||||
- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
||||||
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
||||||
|
- `scroll`: Determines how long the scroll search context keeps the current view of the index fixed, e.g. while updating all documents with embeddings.
|
||||||
|
Defaults to "1d" and should not be larger than this. Can also be given in minutes (e.g. "5m") or hours (e.g. "15h").
|
||||||
|
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
||||||
|
|
||||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
|
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
|
||||||
#### get\_document\_by\_id
|
#### get\_document\_by\_id
|
||||||
|
|||||||
@ -50,7 +50,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
timeout=30,
|
timeout=30,
|
||||||
return_embedding: bool = False,
|
return_embedding: bool = False,
|
||||||
duplicate_documents: str = 'overwrite',
|
duplicate_documents: str = 'overwrite',
|
||||||
index_type: str = "flat"
|
index_type: str = "flat",
|
||||||
|
scroll: str = "1d"
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||||
@ -100,6 +101,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
exists.
|
exists.
|
||||||
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
:param index_type: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
||||||
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
||||||
|
:param scroll: Determines how long the scroll search context keeps the current view of the index fixed, e.g. while updating all documents with embeddings.
|
||||||
|
Defaults to "1d" and should not be larger than this. Can also be given in minutes (e.g. "5m") or hours (e.g. "15h").
|
||||||
|
For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# save init parameters to enable export of component config as YAML
|
# save init parameters to enable export of component config as YAML
|
||||||
@ -110,7 +114,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
custom_mapping=custom_mapping, excluded_meta_data=excluded_meta_data, analyzer=analyzer, scheme=scheme,
|
custom_mapping=custom_mapping, excluded_meta_data=excluded_meta_data, analyzer=analyzer, scheme=scheme,
|
||||||
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
|
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
|
||||||
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
|
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
|
||||||
timeout=timeout, return_embedding=return_embedding, index_type=index_type
|
timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll
|
||||||
)
|
)
|
||||||
|
|
||||||
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
|
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
|
||||||
@ -135,6 +139,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
self.custom_mapping = custom_mapping
|
self.custom_mapping = custom_mapping
|
||||||
self.index: str = index
|
self.index: str = index
|
||||||
self.label_index: str = label_index
|
self.label_index: str = label_index
|
||||||
|
self.scroll = scroll
|
||||||
if similarity in ["cosine", "dot_product", "l2"]:
|
if similarity in ["cosine", "dot_product", "l2"]:
|
||||||
self.similarity = similarity
|
self.similarity = similarity
|
||||||
else:
|
else:
|
||||||
@ -659,7 +664,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
if only_documents_without_embedding:
|
if only_documents_without_embedding:
|
||||||
body['query']['bool']['must_not'] = [{"exists": {"field": self.embedding_field}}]
|
body['query']['bool']['must_not'] = [{"exists": {"field": self.embedding_field}}]
|
||||||
|
|
||||||
result = scan(self.client, query=body, index=index, size=batch_size, scroll="1d")
|
result = scan(self.client, query=body, index=index, size=batch_size, scroll=self.scroll)
|
||||||
yield from result
|
yield from result
|
||||||
|
|
||||||
def query(
|
def query(
|
||||||
@ -1284,4 +1289,4 @@ class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore):
|
|||||||
similarity=similarity,
|
similarity=similarity,
|
||||||
**kwargs)
|
**kwargs)
|
||||||
def _prepare_hosts(self, host, port):
|
def _prepare_hosts(self, host, port):
|
||||||
return host
|
return host
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user