From 04d93ec247bb4ccb71f6e7d60945a33e8904407d Mon Sep 17 00:00:00 2001
From: Sowmiya Jaganathan <49642223+SjSnowball@users.noreply.github.com>
Date: Tue, 23 Nov 2021 23:40:34 +0530
Subject: [PATCH]  Introduced an arg to add synonyms - Elasticsearch (#1625)

* Introduced an arg to add synonyms to Elasticsearch

* Added the test code, removed the whitespace formatting changes, and overwrote the relevant parts of the existing mapping instead of creating a new mapping.

* Added the test code

* Remove whitespace change

* Added the doc_string with examples and link

* Removed unnecessary spaces

* Add latest docstring and tutorial changes

* fix text_field -> content_field

Co-authored-by: sowmiya-emplay <sowmiya.j@emplay.net>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 docs/_src/api/api/document_store.md       |  9 ++++++-
 haystack/document_stores/elasticsearch.py | 30 ++++++++++++++++++-----
 test/test_document_store.py               | 15 ++++++++++++
 3 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index d35095d22..0543fb65b 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -160,7 +160,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
 #### \_\_init\_\_
 
 ```python
- | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True)
+ | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
 ```
 
 A DocumentStore using Elasticsearch to store and query the documents for our search.
@@ -219,6 +219,13 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
                                 Parameter options: (True, False)
                                 False: Raises exception if one or more documents do not have embeddings at query time
                                 True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
+- `synonyms`: List of synonym rules that can be passed when initializing the ElasticsearchDocumentStore.
+                 For example: [ "foo, bar => baz",
+                                "foozball , foosball" ]
+                 More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
+- `synonym_type`: Type of the synonym token filter to use: "synonym" (default) or "synonym_graph".
+                     "synonym_graph" handles synonyms, including multi-word synonyms, correctly during the analysis process.
+                     More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
 
 <a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
 #### get\_document\_by\_id
diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
index a7a523534..b32495047 100644
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -52,7 +52,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         duplicate_documents: str = 'overwrite',
         index_type: str = "flat",
         scroll: str = "1d",
-        skip_missing_embeddings: bool = True
+        skip_missing_embeddings: bool = True,
+        synonyms: Optional[List] = None,
+        synonym_type: str = "synonym"
     ):
         """
         A DocumentStore using Elasticsearch to store and query the documents for our search.
@@ -109,6 +111,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
                                         Parameter options: (True, False)
                                         False: Raises exception if one or more documents do not have embeddings at query time
                                         True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
+        :param synonyms: List of synonym rules that can be passed when initializing the ElasticsearchDocumentStore.
+                         For example: [ "foo, bar => baz",
+                                        "foozball , foosball" ]
+                         More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
+        :param synonym_type: Type of the synonym token filter to use: "synonym" (default) or "synonym_graph".
+                             "synonym_graph" handles synonyms, including multi-word synonyms, correctly during the analysis process.
+                             More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
 
         """
         # save init parameters to enable export of component config as YAML
@@ -120,7 +129,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
             ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
             duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
             timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll,
-            skip_missing_embeddings=skip_missing_embeddings
+            skip_missing_embeddings=skip_missing_embeddings, synonyms=synonyms,synonym_type=synonym_type
         )
 
         self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
@@ -143,6 +152,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         self.return_embedding = return_embedding
 
         self.custom_mapping = custom_mapping
+        self.synonyms = synonyms
+        self.synonym_type = synonym_type
         self.index: str = index
         self.label_index: str = label_index
         self.scroll = scroll
@@ -276,6 +287,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
                     }
                 }
             }
+            if self.synonyms:
+                mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}
+                mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
+                                                                          "filter": ["lowercase",
+                                                                                     "synonym"]}
+                mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}
+
             if self.embedding_field:
                 mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim}
 
@@ -761,7 +779,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
 
         documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
         return documents
-        
+
     def query_by_embedding(self,
                            query_emb: np.ndarray,
                            filters: Optional[Dict[str, List[str]]] = None,
@@ -1062,7 +1080,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
                         }
                 )
                 query["query"]["bool"] = {"filter": filter_clause}
-            
+
             if ids:
                 query["query"]["bool"]["must"] = {"ids": {"values": ids}}
 
@@ -1097,7 +1115,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
     In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
     the KNN plugin that can scale to a large number of documents.
     """
-    
+
     def __init__(self,
                  verify_certs=False,
                  scheme="https",
@@ -1191,7 +1209,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
                 for hit in result
             ]
             return documents
-    
+
     def _create_document_index(self, index_name: str):
         """
         Create a new index for storing documents.
diff --git a/test/test_document_store.py b/test/test_document_store.py
index 4fa1c8e8a..95ea047ef 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -920,3 +920,18 @@ def test_skip_missing_embeddings():
     document_store.skip_missing_embeddings = True
     with pytest.raises(RequestError):
         document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
+
+
+@pytest.mark.elasticsearch
+def test_elasticsearch_synonyms():
+    synonyms = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"]
+    synonym_type = "synonym_graph"
+
+    client = Elasticsearch()
+    client.indices.delete(index='haystack_synonym_arg', ignore=[404])
+    document_store = ElasticsearchDocumentStore(index="haystack_synonym_arg", synonyms=synonyms,
+                                                synonym_type=synonym_type)
+    indexed_settings = client.indices.get_settings(index="haystack_synonym_arg")
+
+    assert synonym_type == indexed_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']['type']
+    assert synonyms == indexed_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']['synonyms']
\ No newline at end of file