Introduced an arg to add synonyms - Elasticsearch (#1625)

* Introduced an arg to add synonyms to Elasticsearch

* Added the test code, removed the whitespace formatting changes, and overwrote the relevant parts from the already existing mapping instead of creating new mapping.

* Added the test code

* Remove whitespace change

* Added the doc_string with examples and link

* Removed unnecessary spaces

* Add latest docstring and tutorial changes

* fix text_field -> content_field

Co-authored-by: sowmiya-emplay <sowmiya.j@emplay.net>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Sowmiya Jaganathan 2021-11-23 23:40:34 +05:30 committed by GitHub
parent 565cb7035d
commit 04d93ec247
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 7 deletions

View File

@ -160,7 +160,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
#### \_\_init\_\_
```python
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True)
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym")
```
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -219,6 +219,13 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
Parameter options: (True, False)
False: Raises exception if one or more documents do not have embeddings at query time
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
- `synonyms`: List of synonyms that can be passed during Elasticsearch initialization.
For example: [ "foo, bar => baz",
"foozball , foosball" ]
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
- `synonym_type`: Synonym filter type to be used.
Use `synonym` or `synonym_graph` to handle synonyms, including multi-word synonyms, correctly during the analysis process.
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
#### get\_document\_by\_id

View File

@ -52,7 +52,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
duplicate_documents: str = 'overwrite',
index_type: str = "flat",
scroll: str = "1d",
skip_missing_embeddings: bool = True
skip_missing_embeddings: bool = True,
synonyms: Optional[List] = None,
synonym_type: str = "synonym"
):
"""
A DocumentStore using Elasticsearch to store and query the documents for our search.
@ -109,6 +111,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
Parameter options: (True, False)
False: Raises exception if one or more documents do not have embeddings at query time
True: Query will ignore all documents without embeddings (recommended if you concurrently index and query)
:param synonyms: List of synonyms can be passed while elasticsearch initialization.
For example: [ "foo, bar => baz",
"foozball , foosball" ]
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
:param synonym_type: Synonym filter type can be passed.
Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process.
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
"""
# save init parameters to enable export of component config as YAML
@ -120,7 +129,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index,
duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity,
timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll,
skip_missing_embeddings=skip_missing_embeddings
skip_missing_embeddings=skip_missing_embeddings, synonyms=synonyms,synonym_type=synonym_type
)
self.client = self._init_elastic_client(host=host, port=port, username=username, password=password,
@ -143,6 +152,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
self.return_embedding = return_embedding
self.custom_mapping = custom_mapping
self.synonyms = synonyms
self.synonym_type = synonym_type
self.index: str = index
self.label_index: str = label_index
self.scroll = scroll
@ -276,6 +287,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
}
}
}
if self.synonyms:
mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}
mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
"filter": ["lowercase",
"synonym"]}
mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}
if self.embedding_field:
mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim}
@ -761,7 +779,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result]
return documents
def query_by_embedding(self,
query_emb: np.ndarray,
filters: Optional[Dict[str, List[str]]] = None,
@ -1062,7 +1080,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
}
)
query["query"]["bool"] = {"filter": filter_clause}
if ids:
query["query"]["bool"]["must"] = {"ids": {"values": ids}}
@ -1097,7 +1115,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
the KNN plugin that can scale to a large number of documents.
"""
def __init__(self,
verify_certs=False,
scheme="https",
@ -1191,7 +1209,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
for hit in result
]
return documents
def _create_document_index(self, index_name: str):
"""
Create a new index for storing documents.

View File

@ -920,3 +920,18 @@ def test_skip_missing_embeddings():
document_store.skip_missing_embeddings = True
with pytest.raises(RequestError):
document_store.query_by_embedding(np.random.rand(768).astype(np.float32))
@pytest.mark.elasticsearch
def test_elasticsearch_synonyms():
    """Check that synonyms and synonym_type given at init are written into the index settings."""
    synonym_rules = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"]
    filter_type = "synonym_graph"

    es_client = Elasticsearch()
    # Start from a clean slate; ignore 404 in case the index does not exist yet.
    es_client.indices.delete(index='haystack_synonym_arg', ignore=[404])

    # Creating the store should create the index with the synonym filter configured.
    document_store = ElasticsearchDocumentStore(
        index="haystack_synonym_arg", synonyms=synonym_rules, synonym_type=filter_type
    )

    stored_settings = es_client.indices.get_settings(index="haystack_synonym_arg")
    synonym_filter = stored_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']
    assert filter_type == synonym_filter['type']
    assert synonym_rules == synonym_filter['synonyms']