mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 12:37:27 +00:00
Provide option to recreate es doc store on initialization (#2084)
* Provide option to recreate ES doc store on initialization
* Add latest docstring and tutorial changes
* Label expects more arguments
* Label also expects an answer

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>
This commit is contained in:
parent
649d074057
commit
88771b2bee
@ -239,7 +239,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore)
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
|
||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
|
||||
```
|
||||
|
||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||
@ -274,7 +274,16 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
|
||||
- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance
|
||||
- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
|
||||
- `verify_certs`: Whether to be strict about ca certificates
|
||||
- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case
|
||||
- `recreate_index`: If set to True, an existing elasticsearch index will be deleted and a new one will be
|
||||
created using the config you are using for initialization. Be aware that all data in the old index will be
|
||||
lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
|
||||
be recreated.
|
||||
- `create_index`:
|
||||
Whether to try creating a new index (if an index of that name already exists, we will just continue in any case)
|
||||
.. deprecated:: 2.0
|
||||
This param is deprecated. In the next major version we will always try to create an index if there is no
|
||||
existing index (the current behaviour when create_index=True). If you are looking to recreate an
|
||||
existing index by deleting it first if it already exists, use the param recreate_index.
|
||||
- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
|
||||
If set to 'wait_for', continue only after changes are visible (slow, but safe).
|
||||
If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
|
||||
@ -708,6 +717,23 @@ Delete labels in an index. All labels are deleted if no filters are passed.
|
||||
|
||||
None
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.delete_index"></a>
|
||||
#### delete\_index
|
||||
|
||||
```python
|
||||
| delete_index(index: str)
|
||||
```
|
||||
|
||||
Delete an existing elasticsearch index. The index including all data will be removed.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: The name of the index to delete.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="elasticsearch.OpenSearchDocumentStore"></a>
|
||||
## OpenSearchDocumentStore
|
||||
|
||||
|
||||
@ -170,6 +170,27 @@ These are used to condition the generator as it generates the answer.
|
||||
What it should return then are novel text spans that form an answer to your question!
|
||||
|
||||
|
||||
```python
|
||||
# Now generate an answer for each question
|
||||
for question in QUESTIONS:
|
||||
# Retrieve related documents from retriever
|
||||
retriever_results = retriever.retrieve(
|
||||
query=question
|
||||
)
|
||||
|
||||
# Now generate answer from question and retrieved documents
|
||||
predicted_result = generator.predict(
|
||||
query=question,
|
||||
documents=retriever_results,
|
||||
top_k=1
|
||||
)
|
||||
|
||||
# Print your answer
|
||||
answers = predicted_result["answers"]
|
||||
print(f'Generated answer is \'{answers[0].answer}\' for the question = \'{question}\'')
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Or alternatively use the Pipeline class
|
||||
from haystack.pipelines import GenerativeQAPipeline
|
||||
|
||||
@ -50,6 +50,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
scheme: str = "http",
|
||||
ca_certs: Optional[str] = None,
|
||||
verify_certs: bool = True,
|
||||
recreate_index: bool = False,
|
||||
create_index: bool = True,
|
||||
refresh_type: str = "wait_for",
|
||||
similarity="dot_product",
|
||||
@ -93,7 +94,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
:param scheme: 'https' or 'http', protocol used to connect to your elasticsearch instance
|
||||
:param ca_certs: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
|
||||
:param verify_certs: Whether to be strict about ca certificates
|
||||
:param create_index: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case
|
||||
:param recreate_index: If set to True, an existing elasticsearch index will be deleted and a new one will be
|
||||
created using the config you are using for initialization. Be aware that all data in the old index will be
|
||||
lost if you choose to recreate the index. Be aware that both the document_index and the label_index will
|
||||
be recreated.
|
||||
:param create_index:
|
||||
Whether to try creating a new index (if an index of that name already exists, we will just continue in any case)
|
||||
.. deprecated:: 2.0
|
||||
This param is deprecated. In the next major version we will always try to create an index if there is no
|
||||
existing index (the current behaviour when create_index=True). If you are looking to recreate an
|
||||
existing index by deleting it first if it already exists, use the param recreate_index.
|
||||
:param refresh_type: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
|
||||
If set to 'wait_for', continue only after changes are visible (slow, but safe).
|
||||
If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
|
||||
@ -175,6 +185,12 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
if index_type == "hnsw" and type(self) == ElasticsearchDocumentStore:
|
||||
raise Exception("The HNSW algorithm for approximate nearest neighbours calculation is currently not available in the ElasticSearchDocumentStore. "
|
||||
"Try the OpenSearchDocumentStore instead.")
|
||||
if recreate_index:
|
||||
self.delete_index(index)
|
||||
self.delete_index(label_index)
|
||||
self._create_document_index(index)
|
||||
self._create_label_index(label_index)
|
||||
|
||||
if create_index:
|
||||
self._create_document_index(index)
|
||||
self._create_label_index(label_index)
|
||||
@ -1243,6 +1259,16 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
|
||||
index = index or self.label_index
|
||||
self.delete_documents(index=index, ids=ids, filters=filters, headers=headers)
|
||||
|
||||
def delete_index(self, index: str):
|
||||
"""
|
||||
Delete an existing elasticsearch index. The index including all data will be removed.
|
||||
|
||||
:param index: The name of the index to delete.
|
||||
:return: None
|
||||
"""
|
||||
self.client.indices.delete(index=index, ignore=[400, 404])
|
||||
logger.debug(f'deleted elasticsearch index {index}')
|
||||
|
||||
|
||||
class OpenSearchDocumentStore(ElasticsearchDocumentStore):
|
||||
"""
|
||||
|
||||
@ -53,6 +53,32 @@ def test_init_elastic_client():
|
||||
_ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test")
|
||||
|
||||
|
||||
@pytest.mark.elasticsearch
|
||||
def test_init_elastic_doc_store_with_index_recreation():
|
||||
index_name = 'test_index_recreation'
|
||||
label_index_name = 'test_index_recreation_labels'
|
||||
|
||||
document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name)
|
||||
documents = [Document(content="Doc1")]
|
||||
labels = [Label(
|
||||
query='query',
|
||||
document=documents[0],
|
||||
is_correct_document=True,
|
||||
is_correct_answer=False,
|
||||
origin='user-feedback',
|
||||
answer=None
|
||||
)]
|
||||
document_store.write_documents(documents, index=index_name)
|
||||
document_store.write_labels(labels, index=label_index_name)
|
||||
|
||||
document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name, recreate_index=True)
|
||||
docs = document_store.get_all_documents(index=index_name)
|
||||
labels = document_store.get_all_labels(index=label_index_name)
|
||||
|
||||
assert len(docs) == 0
|
||||
assert len(labels) == 0
|
||||
|
||||
|
||||
def test_write_with_duplicate_doc_ids(document_store):
|
||||
duplicate_documents = [
|
||||
Document(
|
||||
@ -910,6 +936,24 @@ def test_elasticsearch_custom_fields():
|
||||
np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
|
||||
|
||||
|
||||
@pytest.mark.elasticsearch
|
||||
def test_elasticsearch_delete_index():
|
||||
client = Elasticsearch()
|
||||
index_name = "haystack_test_deletion"
|
||||
|
||||
document_store = ElasticsearchDocumentStore(index=index_name)
|
||||
|
||||
# the index should exist
|
||||
index_exists = client.indices.exists(index=index_name)
|
||||
assert index_exists
|
||||
|
||||
document_store.delete_index(index_name)
|
||||
|
||||
# the index was deleted and should not exist
|
||||
index_exists = client.indices.exists(index=index_name)
|
||||
assert not index_exists
|
||||
|
||||
|
||||
@pytest.mark.elasticsearch
|
||||
def test_get_document_count_only_documents_without_embedding_arg():
|
||||
documents = [
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user