mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-10-14 17:38:24 +00:00
Introduced an arg to add synonyms - Elasticsearch (#1625)
* Introduced an arg to add synonyms to Elasticsearch * Added the test code, removed the whitespace formatting changes, and overwrote the relevant parts from the already existing mapping instead of creating new mapping. * Added the test code * Remove whitespace change * Added the doc_string with examples and link * Removed unnecessary spaces * Add latest docstring and tutorial changes * fix text_field -> content_field Co-authored-by: sowmiya-emplay <sowmiya.j@emplay.net> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
565cb7035d
commit
04d93ec247
@ -160,7 +160,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
|
|||||||
#### \_\_init\_\_
|
#### \_\_init\_\_
|
||||||
|
|
||||||
```python
|
```python
|
||||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True)
|
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
|
||||||
```
|
```
|
||||||
|
|
||||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||||
@ -219,6 +219,13 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
|
|||||||
Parameter options: (True, False)
|
Parameter options: (True, False)
|
||||||
False: Raises exception if one or more documents do not have embeddings at query time
|
False: Raises exception if one or more documents do not have embeddings at query time
|
||||||
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
|
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
|
||||||
|
- `synonyms`: List of synonyms to pass when initializing the Elasticsearch document store.
|
||||||
|
For example: [ "foo, bar => baz",
|
||||||
|
"foozball , foosball" ]
|
||||||
|
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
|
||||||
|
- `synonym_type`: Type of the synonym token filter to use.
|
||||||
|
Either "synonym" or "synonym_graph"; the latter handles multi-word synonyms correctly during the analysis process.
|
||||||
|
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
|
||||||
|
|
||||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
|
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
|
||||||
#### get\_document\_by\_id
|
#### get\_document\_by\_id
|
||||||
|
@ -52,7 +52,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
duplicate_documents: str = 'overwrite',
|
duplicate_documents: str = 'overwrite',
|
||||||
index_type: str = "flat",
|
index_type: str = "flat",
|
||||||
scroll: str = "1d",
|
scroll: str = "1d",
|
||||||
skip_missing_embeddings: bool = True
|
skip_missing_embeddings: bool = True,
|
||||||
|
synonyms: Optional[List] = None,
|
||||||
|
synonym_type: str = "synonym"
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||||
@ -109,6 +111,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
Parameter options: (True, False)
|
Parameter options: (True, False)
|
||||||
False: Raises exception if one or more documents do not have embeddings at query time
|
False: Raises exception if one or more documents do not have embeddings at query time
|
||||||
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
|
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
|
||||||
|
:param synonyms: List of synonyms to pass when initializing the Elasticsearch document store.
|
||||||
|
For example: [ "foo, bar => baz",
|
||||||
|
"foozball , foosball" ]
|
||||||
|
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
|
||||||
|
:param synonym_type: Type of the synonym token filter to use.
|
||||||
|
Either "synonym" or "synonym_graph"; the latter handles multi-word synonyms correctly during the analysis process.
|
||||||
|
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# save init parameters to enable export of component config as YAML
|
# save init parameters to enable export of component config as YAML
|
||||||
@ -120,7 +129,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
|
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
|
||||||
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
|
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
|
||||||
timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll,
|
timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll,
|
||||||
skip_missing_embeddings=skip_missing_embeddings
|
skip_missing_embeddings=skip_missing_embeddings, synonyms=synonyms,synonym_type=synonym_type
|
||||||
)
|
)
|
||||||
|
|
||||||
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
|
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
|
||||||
@ -143,6 +152,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
self.return_embedding = return_embedding
|
self.return_embedding = return_embedding
|
||||||
|
|
||||||
self.custom_mapping = custom_mapping
|
self.custom_mapping = custom_mapping
|
||||||
|
self.synonyms = synonyms
|
||||||
|
self.synonym_type = synonym_type
|
||||||
self.index: str = index
|
self.index: str = index
|
||||||
self.label_index: str = label_index
|
self.label_index: str = label_index
|
||||||
self.scroll = scroll
|
self.scroll = scroll
|
||||||
@ -276,6 +287,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if self.synonyms:
|
||||||
|
mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}
|
||||||
|
mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
|
||||||
|
"filter": ["lowercase",
|
||||||
|
"synonym"]}
|
||||||
|
mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}
|
||||||
|
|
||||||
if self.embedding_field:
|
if self.embedding_field:
|
||||||
mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim}
|
mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim}
|
||||||
|
|
||||||
@ -761,7 +779,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
|
|
||||||
documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
|
documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def query_by_embedding(self,
|
def query_by_embedding(self,
|
||||||
query_emb: np.ndarray,
|
query_emb: np.ndarray,
|
||||||
filters: Optional[Dict[str, List[str]]] = None,
|
filters: Optional[Dict[str, List[str]]] = None,
|
||||||
@ -1062,7 +1080,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
query["query"]["bool"] = {"filter": filter_clause}
|
query["query"]["bool"] = {"filter": filter_clause}
|
||||||
|
|
||||||
if ids:
|
if ids:
|
||||||
query["query"]["bool"]["must"] = {"ids": {"values": ids}}
|
query["query"]["bool"]["must"] = {"ids": {"values": ids}}
|
||||||
|
|
||||||
@ -1097,7 +1115,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
|
|||||||
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
|
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
|
||||||
the KNN plugin that can scale to a large number of documents.
|
the KNN plugin that can scale to a large number of documents.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
verify_certs=False,
|
verify_certs=False,
|
||||||
scheme="https",
|
scheme="https",
|
||||||
@ -1191,7 +1209,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
|
|||||||
for hit in result
|
for hit in result
|
||||||
]
|
]
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def _create_document_index(self, index_name: str):
|
def _create_document_index(self, index_name: str):
|
||||||
"""
|
"""
|
||||||
Create a new index for storing documents.
|
Create a new index for storing documents.
|
||||||
|
@ -920,3 +920,18 @@ def test_skip_missing_embeddings():
|
|||||||
document_store.skip_missing_embeddings = True
|
document_store.skip_missing_embeddings = True
|
||||||
with pytest.raises(RequestError):
|
with pytest.raises(RequestError):
|
||||||
document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
|
document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.elasticsearch
|
||||||
|
def test_elasticsearch_synonyms():
|
||||||
|
synonyms = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"]
|
||||||
|
synonym_type = "synonym_graph"
|
||||||
|
|
||||||
|
client = Elasticsearch()
|
||||||
|
client.indices.delete(index='haystack_synonym_arg', ignore=[404])
|
||||||
|
document_store = ElasticsearchDocumentStore(index="haystack_synonym_arg", synonyms=synonyms,
|
||||||
|
synonym_type=synonym_type)
|
||||||
|
indexed_settings = client.indices.get_settings(index="haystack_synonym_arg")
|
||||||
|
|
||||||
|
assert synonym_type == indexed_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']['type']
|
||||||
|
assert synonyms == indexed_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']['synonyms']
|
Loading…
x
Reference in New Issue
Block a user