diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index dc0eb21c2..93dc818e1 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -239,7 +239,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore) #### \_\_init\_\_ ```python - | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym") + | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, 
return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym") ``` A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -274,7 +274,16 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea - `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance - `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine. - `verify_certs`: Whether to be strict about ca certificates -- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case +- `recreate_index`: If set to True, an existing elasticsearch index will be deleted and a new one will be + created using the config you are using for initialization. Be aware that all data in the old index will be + lost if you choose to recreate the index. Be aware that both the document_index and the label_index will + be recreated. +- `create_index`: + Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case) + .. deprecated:: 2.0 + This param is deprecated. In the next major version we will always try to create an index if there is no + existing index (the current behaviour when create_index=True). If you are looking to recreate an + existing index by deleting it first if it already exists, use param recreate_index. - `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search. If set to 'wait_for', continue only after changes are visible (slow, but safe). 
If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion). @@ -708,6 +717,23 @@ Delete labels in an index. All labels are deleted if no filters are passed. None + +#### delete\_index + +```python + | delete_index(index: str) +``` + +Delete an existing elasticsearch index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + ## OpenSearchDocumentStore diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md index 36b426a4b..f49d87160 100644 --- a/docs/_src/tutorials/tutorials/7.md +++ b/docs/_src/tutorials/tutorials/7.md @@ -170,6 +170,27 @@ These are used to condition the generator as it generates the answer. What it should return then are novel text spans that form and answer to your question! +```python +# Now generate an answer for each question +for question in QUESTIONS: + # Retrieve related documents from retriever + retriever_results = retriever.retrieve( + query=question + ) + + # Now generate answer from question and retrieved documents + predicted_result = generator.predict( + query=question, + documents=retriever_results, + top_k=1 + ) + + # Print your answer + answers = predicted_result["answers"] + print(f'Generated answer is \'{answers[0].answer}\' for the question = \'{question}\'') +``` + + ```python # Or alternatively use the Pipeline class from haystack.pipelines import GenerativeQAPipeline diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index c4d67d2b3..075a5333b 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -50,6 +50,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, + recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", 
similarity="dot_product", @@ -93,7 +94,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): :param scheme: 'https' or 'http', protocol used to connect to your elasticsearch instance :param ca_certs: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine. :param verify_certs: Whether to be strict about ca certificates - :param create_index: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case + :param recreate_index: If set to True, an existing elasticsearch index will be deleted and a new one will be + created using the config you are using for initialization. Be aware that all data in the old index will be + lost if you choose to recreate the index. Be aware that both the document_index and the label_index will + be recreated. + :param create_index: + Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case) + .. deprecated:: 2.0 + This param is deprecated. In the next major version we will always try to create an index if there is no + existing index (the current behaviour when create_index=True). If you are looking to recreate an + existing index by deleting it first if it already exists, use param recreate_index. :param refresh_type: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search. If set to 'wait_for', continue only after changes are visible (slow, but safe). If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion). 
@@ -175,6 +185,12 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): if index_type == "hnsw" and type(self) == ElasticsearchDocumentStore: raise Exception("The HNSW algorithm for approximate nearest neighbours calculation is currently not available in the ElasticSearchDocumentStore. " "Try the OpenSearchDocumentStore instead.") + if recreate_index: + self.delete_index(index) + self.delete_index(label_index) + self._create_document_index(index) + self._create_label_index(label_index) + if create_index: self._create_document_index(index) self._create_label_index(label_index) @@ -1243,6 +1259,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore): index = index or self.label_index self.delete_documents(index=index, ids=ids, filters=filters, headers=headers) + def delete_index(self, index: str): + """ + Delete an existing elasticsearch index. The index including all data will be removed. + + :param index: The name of the index to delete. + :return: None + """ + self.client.indices.delete(index=index, ignore=[400, 404]) + logger.debug(f'deleted elasticsearch index {index}') + class OpenSearchDocumentStore(ElasticsearchDocumentStore): """ diff --git a/test/test_document_store.py b/test/test_document_store.py index cefe9123d..6219ae77f 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -53,6 +53,32 @@ def test_init_elastic_client(): _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test") +@pytest.mark.elasticsearch +def test_init_elastic_doc_store_with_index_recreation(): + index_name = 'test_index_recreation' + label_index_name = 'test_index_recreation_labels' + + document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name) + documents = [Document(content="Doc1")] + labels = [Label( + query='query', + document=documents[0], + is_correct_document=True, + is_correct_answer=False, + origin='user-feedback', + answer=None + )] + 
document_store.write_documents(documents, index=index_name) + document_store.write_labels(labels, index=label_index_name) + + document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name, recreate_index=True) + docs = document_store.get_all_documents(index=index_name) + labels = document_store.get_all_labels(index=label_index_name) + + assert len(docs) == 0 + assert len(labels) == 0 + + def test_write_with_duplicate_doc_ids(document_store): duplicate_documents = [ Document( @@ -910,6 +936,24 @@ def test_elasticsearch_custom_fields(): np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding) +@pytest.mark.elasticsearch +def test_elasticsearch_delete_index(): + client = Elasticsearch() + index_name = "haystack_test_deletion" + + document_store = ElasticsearchDocumentStore(index=index_name) + + # the index should exist + index_exists = client.indices.exists(index=index_name) + assert index_exists + + document_store.delete_index(index_name) + + # the index was deleted and should not exist + index_exists = client.indices.exists(index=index_name) + assert not index_exists + + @pytest.mark.elasticsearch def test_get_document_count_only_documents_without_embedding_arg(): documents = [