Provide option to recreate es doc store on initialization (#2084)

* provide option to recreate es doc store on initialization * Add latest docstring and tutorial changes * Label expects more arguments * Label expects also an answer Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
2026-01-07 12:37:27 +00:00 · 2022-02-02 11:03:15 +01:00 · 2022-02-02 11:03:15 +01:00 · 88771b2bee
commit 88771b2bee
parent 649d074057
4 changed files with 120 additions and 3 deletions
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@ -239,7 +239,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore)
 #### \_\_init\_\_

 ```python
- | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
+ | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
 ```

 A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -274,7 +274,16 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
 - `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance
 - `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
 - `verify_certs`: Whether to be strict about ca certificates
- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case
+- `recreate_index`: If set to True, an existing elasticsearch index will be deleted and a new one will be
+    created using the config you are using for initialization. Be aware that all data in the old index will be
+    lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
+    be recreated.
+- `create_index`:
+    Whether to try creating a new index (if an index of that name already exists, we will just continue in any case).
+    .. deprecated:: 2.0
+        This param is deprecated. In the next major version we will always try to create an index if there is no
+        existing index (the current behaviour when create_index=True). If you are looking to recreate an
+        existing index by deleting it first if it already exists, use the param recreate_index.
 - `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
                     If set to 'wait_for', continue only after changes are visible (slow, but safe).
                     If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
@ -708,6 +717,23 @@ Delete labels in an index. All labels are deleted if no filters are passed.

 None

+<a name="elasticsearch.ElasticsearchDocumentStore.delete_index"></a>
+#### delete\_index
+
+```python
+ | delete_index(index: str)
+```
+
+Delete an existing elasticsearch index. The index including all data will be removed.
+
+**Arguments**:
+
+- `index`: The name of the index to delete.
+
+**Returns**:
+
+None
+
 <a name="elasticsearch.OpenSearchDocumentStore"></a>
 ## OpenSearchDocumentStore

--- a/docs/_src/tutorials/tutorials/7.md
+++ b/docs/_src/tutorials/tutorials/7.md
@ -170,6 +170,27 @@ These are used to condition the generator as it generates the answer.
 What it should return then are novel text spans that form an answer to your question!


+```python
+# Now generate an answer for each question
+for question in QUESTIONS:
+    # Retrieve related documents from retriever
+    retriever_results = retriever.retrieve(
+        query=question
+    )
+
+    # Now generate answer from question and retrieved documents
+    predicted_result = generator.predict(
+        query=question,
+        documents=retriever_results,
+        top_k=1
+    )
+
+    # Print your answer
+    answers = predicted_result["answers"]
+    print(f'Generated answer is \'{answers[0].answer}\' for the question = \'{question}\'')
+```
+
+
 ```python
 # Or alternatively use the Pipeline class
 from haystack.pipelines import GenerativeQAPipeline
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@ -50,6 +50,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
        scheme: str = "http",
        ca_certs: Optional[str] = None,
        verify_certs: bool = True,
+        recreate_index: bool = False,
        create_index: bool = True,
        refresh_type: str = "wait_for",
        similarity="dot_product",
@ -93,7 +94,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
        :param scheme: 'https' or 'http', protocol used to connect to your elasticsearch instance
        :param ca_certs: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
        :param verify_certs: Whether to be strict about ca certificates
-        :param create_index: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case
+        :param recreate_index: If set to True, an existing elasticsearch index will be deleted and a new one will be
+            created using the config you are using for initialization. Be aware that all data in the old index will be
+            lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
+            be recreated.
+        :param create_index:
+            Whether to try creating a new index (if an index of that name already exists, we will just continue in any case).
+            .. deprecated:: 2.0
+                This param is deprecated. In the next major version we will always try to create an index if there is no
+                existing index (the current behaviour when create_index=True). If you are looking to recreate an
+                existing index by deleting it first if it already exists, use the param recreate_index.
        :param refresh_type: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
                             If set to 'wait_for', continue only after changes are visible (slow, but safe).
                             If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
@ -175,6 +185,12 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
        if index_type == "hnsw" and type(self) == ElasticsearchDocumentStore:
            raise Exception("The HNSW algorithm for approximate nearest neighbours calculation is currently not available in the ElasticSearchDocumentStore. "
                            "Try the OpenSearchDocumentStore instead.")
+        if recreate_index:
+            # Drop both indices first so they are rebuilt from the config passed to this initializer.
+            self.delete_index(index)
+            self.delete_index(label_index)
+            self._create_document_index(index)
+            # The label index must be created under its own name, not the document index name.
+            self._create_label_index(label_index)
+
        if create_index:
            self._create_document_index(index)
            self._create_label_index(label_index)
@ -1243,6 +1259,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
        index = index or self.label_index
        self.delete_documents(index=index, ids=ids, filters=filters, headers=headers)

+    def delete_index(self, index: str):
+        """
+        Delete an existing elasticsearch index. The index including all data will be removed.
+
+        The delete request is sent with `ignore=[400, 404]`, so those two HTTP status codes are
+        not treated as errors by this call (presumably this makes deleting a missing index a
+        no-op -- confirm against the Elasticsearch client documentation). The debug message
+        below is logged regardless of which status the server returned.
+
+        :param index: The name of the index to delete.
+        :return: None
+        """
+        # 400 and 404 are tolerated instead of being raised as exceptions.
+        self.client.indices.delete(index=index, ignore=[400, 404])
+        logger.debug(f'deleted elasticsearch index {index}')
+

 class OpenSearchDocumentStore(ElasticsearchDocumentStore):
    """
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@ -53,6 +53,32 @@ def test_init_elastic_client():
    _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")


+@pytest.mark.elasticsearch
+def test_init_elastic_doc_store_with_index_recreation():
+    """
+    Check that initializing a store with `recreate_index=True` leaves both the document index
+    and the label index empty, even though data was written to them by a previous store instance.
+    """
+    index_name = 'test_index_recreation'
+    label_index_name = 'test_index_recreation_labels'
+
+    # First store instance: populate both indices with one document and one label.
+    document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name)
+    documents = [Document(content="Doc1")]
+    labels = [Label(
+        query='query',
+        document=documents[0],
+        is_correct_document=True,
+        is_correct_answer=False,
+        origin='user-feedback',
+        answer=None
+    )]
+    document_store.write_documents(documents, index=index_name)
+    document_store.write_labels(labels, index=label_index_name)
+
+    # Second store instance on the same index names, this time asking for recreation.
+    document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name, recreate_index=True)
+    docs = document_store.get_all_documents(index=index_name)
+    labels = document_store.get_all_labels(index=label_index_name)
+
+    # Nothing written through the first instance may remain.
+    assert len(docs) == 0
+    assert len(labels) == 0
+
+
 def test_write_with_duplicate_doc_ids(document_store):
    duplicate_documents = [
        Document(
@ -910,6 +936,24 @@ def test_elasticsearch_custom_fields():
    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)


+@pytest.mark.elasticsearch
+def test_elasticsearch_delete_index():
+    """
+    Check that `delete_index` removes an index that was created by initializing a document store.
+    Existence is verified through a separate Elasticsearch client rather than through the store.
+    """
+    # Independent client built with default arguments, used only for the existence checks.
+    client = Elasticsearch()
+    index_name = "haystack_test_deletion"
+
+    document_store = ElasticsearchDocumentStore(index=index_name)
+
+    # the index should exist
+    index_exists = client.indices.exists(index=index_name)
+    assert index_exists
+
+    document_store.delete_index(index_name)
+
+    # the index was deleted and should not exist
+    index_exists = client.indices.exists(index=index_name)
+    assert not index_exists
+
+
@pytest.mark.elasticsearch
 def test_get_document_count_only_documents_without_embedding_arg():
    documents = [