Provide option to recreate es doc store on initialization (#2084)

* provide option to recreate es doc store on initialization

* Add latest docstring and tutorial changes

* Label expects more arguments

* Label expects also an answer

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in:
mathislucka 2022-02-02 11:03:15 +01:00 committed by GitHub
parent 649d074057
commit 88771b2bee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 120 additions and 3 deletions

View File

@ -239,7 +239,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore)
#### \_\_init\_\_
```python
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
```
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -274,7 +274,16 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance
- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
- `verify_certs`: Whether to be strict about ca certificates
- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case
- `recreate_index`: If set to True, an existing elasticsearch index will be deleted and a new one will be
created using the config you are using for initialization. Be aware that all data in the old index will be
lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
be recreated.
- `create_index`:
Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case)
.. deprecated:: 2.0
This param is deprecated. In the next major version we will always try to create an index if there is no
existing index (the current behaviour when create_index=True). If you are looking to recreate an
existing index by deleting it first if it already exists, use the `recreate_index` param instead.
- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
If set to 'wait_for', continue only after changes are visible (slow, but safe).
If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
@ -708,6 +717,23 @@ Delete labels in an index. All labels are deleted if no filters are passed.
None
<a name="elasticsearch.ElasticsearchDocumentStore.delete_index"></a>
#### delete\_index
```python
| delete_index(index: str)
```
Delete an existing elasticsearch index. The index including all data will be removed.
**Arguments**:
- `index`: The name of the index to delete.
**Returns**:
None
<a name="elasticsearch.OpenSearchDocumentStore"></a>
## OpenSearchDocumentStore

View File

@ -170,6 +170,27 @@ These are used to condition the generator as it generates the answer.
What it should return then are novel text spans that form an answer to your question!
```python
# Now generate an answer for each question
for question in QUESTIONS:
# Retrieve related documents from retriever
retriever_results = retriever.retrieve(
query=question
)
# Now generate answer from question and retrieved documents
predicted_result = generator.predict(
query=question,
documents=retriever_results,
top_k=1
)
# Print your answer
answers = predicted_result["answers"]
print(f'Generated answer is \'{answers[0].answer}\' for the question = \'{question}\'')
```
```python
# Or alternatively use the Pipeline class
from haystack.pipelines import GenerativeQAPipeline

View File

@ -50,6 +50,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
scheme: str = "http",
ca_certs: Optional[str] = None,
verify_certs: bool = True,
recreate_index: bool = False,
create_index: bool = True,
refresh_type: str = "wait_for",
similarity="dot_product",
@ -93,7 +94,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
:param scheme: 'https' or 'http', protocol used to connect to your elasticsearch instance
:param ca_certs: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
:param verify_certs: Whether to be strict about ca certificates
:param create_index: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case
:param recreate_index: If set to True, an existing elasticsearch index will be deleted and a new one will be
created using the config you are using for initialization. Be aware that all data in the old index will be
lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
be recreated.
:param create_index:
Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case)
.. deprecated:: 2.0
This param is deprecated. In the next major version we will always try to create an index if there is no
existing index (the current behaviour when create_index=True). If you are looking to recreate an
existing index by deleting it first if it already exists, use the `recreate_index` param instead.
:param refresh_type: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
If set to 'wait_for', continue only after changes are visible (slow, but safe).
If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
@ -175,6 +185,12 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
if index_type == "hnsw" and type(self) == ElasticsearchDocumentStore:
raise Exception("The HNSW algorithm for approximate nearest neighbours calculation is currently not available in the ElasticSearchDocumentStore. "
"Try the OpenSearchDocumentStore instead.")
if recreate_index:
    # Drop both indices, then rebuild each one under its own name so the
    # label index is recreated even when create_index is False.
    self.delete_index(index)
    self.delete_index(label_index)
    self._create_document_index(index)
    self._create_label_index(label_index)
if create_index:
self._create_document_index(index)
self._create_label_index(label_index)
@ -1243,6 +1259,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
index = index or self.label_index
self.delete_documents(index=index, ids=ids, filters=filters, headers=headers)
def delete_index(self, index: str):
    """
    Remove an elasticsearch index together with all the data stored in it.

    The delete request is sent with HTTP statuses 400 and 404 listed as
    ignored, and a debug message naming the index is logged afterwards.

    :param index: The name of the index to delete.
    :return: None
    """
    # Statuses handed to the client's `ignore` option for this request.
    ignored_statuses = [400, 404]
    self.client.indices.delete(index=index, ignore=ignored_statuses)
    logger.debug(f'deleted elasticsearch index {index}')
class OpenSearchDocumentStore(ElasticsearchDocumentStore):
"""

View File

@ -53,6 +53,32 @@ def test_init_elastic_client():
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")
@pytest.mark.elasticsearch
def test_init_elastic_doc_store_with_index_recreation():
    """
    Re-initialising a store with recreate_index=True must leave both the
    document index and the label index empty.
    """
    doc_index = 'test_index_recreation'
    lbl_index = 'test_index_recreation_labels'

    # First store: write one document and one label into the two indices.
    store = ElasticsearchDocumentStore(index=doc_index, label_index=lbl_index)
    seed_doc = Document(content="Doc1")
    seed_label = Label(
        query='query',
        document=seed_doc,
        is_correct_document=True,
        is_correct_answer=False,
        origin='user-feedback',
        answer=None,
    )
    store.write_documents([seed_doc], index=doc_index)
    store.write_labels([seed_label], index=lbl_index)

    # Second store on the same index names, this time asking for recreation.
    store = ElasticsearchDocumentStore(index=doc_index, label_index=lbl_index, recreate_index=True)
    remaining_docs = store.get_all_documents(index=doc_index)
    remaining_labels = store.get_all_labels(index=lbl_index)

    assert len(remaining_docs) == 0
    assert len(remaining_labels) == 0
def test_write_with_duplicate_doc_ids(document_store):
duplicate_documents = [
Document(
@ -910,6 +936,24 @@ def test_elasticsearch_custom_fields():
np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
@pytest.mark.elasticsearch
def test_elasticsearch_delete_index():
    """
    An index present after store initialisation must be gone once
    delete_index has been called with its name.
    """
    es_client = Elasticsearch()
    index_name = "haystack_test_deletion"
    store = ElasticsearchDocumentStore(index=index_name)

    # the index should exist after the store has been initialised
    assert es_client.indices.exists(index=index_name)

    store.delete_index(index_name)

    # the index was deleted and should not exist
    assert not es_client.indices.exists(index=index_name)
@pytest.mark.elasticsearch
def test_get_document_count_only_documents_without_embedding_arg():
documents = [