Introduced an arg to add synonyms - Elasticsearch (#1625)

* Introduced an arg to add synonyms to Elasticsearch

* Added the test code, removed the whitespace formatting changes, and overwrote the relevant parts from the already existing mapping instead of creating new mapping.

* Added the test code

* Remove whitespace change

* Added the doc_string with examples and link

* Removed unnecessary spaces

* Add latest docstring and tutorial changes

* fix text_field -> content_field

Co-authored-by: sowmiya-emplay <sowmiya.j@emplay.net>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Sowmiya Jaganathan 2021-11-23 23:40:34 +05:30 committed by GitHub
parent 565cb7035d
commit 04d93ec247
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 7 deletions

View File

@ -160,7 +160,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
#### \_\_init\_\_
```python
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True)
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
```
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -219,6 +219,13 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
Parameter options: (True, False)
False: Raises exception if one or more documents do not have embeddings at query time
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
- `synonyms`: List of synonyms that can be passed during Elasticsearch initialization.
For example: [ "foo, bar => baz",
"foozball , foosball" ]
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
- `synonym_type`: Synonym filter type to be used.
Use `synonym` or `synonym_graph` to handle synonyms, including multi-word synonyms, correctly during the analysis process.
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
#### get\_document\_by\_id

View File

@ -52,7 +52,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
duplicate_documents: str = 'overwrite',
index_type: str = "flat",
scroll: str = "1d",
skip_missing_embeddings: bool = True
skip_missing_embeddings: bool = True,
synonyms: Optional[List] = None,
synonym_type: str = "synonym"
):
"""
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -109,6 +111,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
Parameter options: (True, False)
False: Raises exception if one or more documents do not have embeddings at query time
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
:param synonyms: List of synonyms can be passed while elasticsearch initialization.
For example: [ "foo, bar => baz",
"foozball , foosball" ]
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
:param synonym_type: Synonym filter type can be passed.
Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process.
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
"""
# save init parameters to enable export of component config as YAML
@ -120,7 +129,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll,
skip_missing_embeddings=skip_missing_embeddings
skip_missing_embeddings=skip_missing_embeddings, synonyms=synonyms,synonym_type=synonym_type
)
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
@ -143,6 +152,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.return_embedding = return_embedding
self.custom_mapping = custom_mapping
self.synonyms = synonyms
self.synonym_type = synonym_type
self.index: str = index
self.label_index: str = label_index
self.scroll = scroll
@ -276,6 +287,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
}
}
}
if self.synonyms:
mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}
mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
"filter": ["lowercase",
"synonym"]}
mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}
if self.embedding_field:
mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim}
@ -761,7 +779,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
return documents
def query_by_embedding(self,
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
@ -1062,7 +1080,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
}
)
query["query"]["bool"] = {"filter": filter_clause}
if ids:
query["query"]["bool"]["must"] = {"ids": {"values": ids}}
@ -1097,7 +1115,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
the KNN plugin that can scale to a large number of documents.
"""
def __init__(self,
verify_certs=False,
scheme="https",
@ -1191,7 +1209,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
for hit in result
]
return documents
def _create_document_index(self, index_name: str):
"""
Create a new index for storing documents.

View File

@ -920,3 +920,18 @@ def test_skip_missing_embeddings():
document_store.skip_missing_embeddings = True
with pytest.raises(RequestError):
document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
@pytest.mark.elasticsearch
def test_elasticsearch_synonyms():
    """Check that synonyms and synonym_type given at init are written into the index settings."""
    synonym_rules = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"]
    filter_type = "synonym_graph"

    es_client = Elasticsearch()
    # Start from a clean slate; ignore 404 in case the index does not exist yet.
    es_client.indices.delete(index='haystack_synonym_arg', ignore=[404])

    # Creating the store should create the index with the synonym filter configured.
    document_store = ElasticsearchDocumentStore(
        index="haystack_synonym_arg", synonyms=synonym_rules, synonym_type=filter_type
    )

    stored_settings = es_client.indices.get_settings(index="haystack_synonym_arg")
    synonym_filter = stored_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']
    assert filter_type == synonym_filter['type']
    assert synonym_rules == synonym_filter['synonyms']