From 04d93ec247bb4ccb71f6e7d60945a33e8904407d Mon Sep 17 00:00:00 2001 From: Sowmiya Jaganathan <49642223+SjSnowball@users.noreply.github.com> Date: Tue, 23 Nov 2021 23:40:34 +0530 Subject: [PATCH] Introduced an arg to add synonyms - Elasticsearch (#1625) * Introduced an arg add synonyms to Elasticsearch * Added the test code, removed the whitespace formatting changes, and overwrote the relevant parts from the already existing mapping instead of creating new mapping. * Added the test code * Remove whitespace change * Added the doc_string with examples and link * Removed unneccessary spaces * Add latest docstring and tutorial changes * fix text_field -> content_field Co-authored-by: sowmiya-emplay Co-authored-by: Malte Pietsch Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/_src/api/api/document_store.md | 9 ++++++- haystack/document_stores/elasticsearch.py | 30 ++++++++++++++++++----- test/test_document_store.py | 15 ++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index d35095d22..0543fb65b 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -160,7 +160,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore) #### \_\_init\_\_ ```python - | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, 
refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True) + | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym") ``` A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -219,6 +219,13 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea Parameter options: (True, False) False: Raises exception if one or more documents do not have embeddings at query time True: Query will ignore all documents without embeddings (recommended if you concurrently index and query) +- `synonyms`: List of synonyms that can be passed when initializing the Elasticsearch document store. + For example: [ "foo, bar => baz", + "foozball , foosball" ] + More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html +- `synonym_type`: Synonym filter type can be passed.
+ Use "synonym" or "synonym_graph"; the latter handles multi-word synonyms correctly during the analysis process. + More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html #### get\_document\_by\_id diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index a7a523534..b32495047 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -52,7 +52,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore): duplicate_documents: str = 'overwrite', index_type: str = "flat", scroll: str = "1d", - skip_missing_embeddings: bool = True + skip_missing_embeddings: bool = True, + synonyms: Optional[List] = None, + synonym_type: str = "synonym" ): """ A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -109,6 +111,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore): Parameter options: (True, False) False: Raises exception if one or more documents do not have embeddings at query time True: Query will ignore all documents without embeddings (recommended if you concurrently index and query) + :param synonyms: List of synonyms that can be passed when initializing the Elasticsearch document store. + For example: [ "foo, bar => baz", + "foozball , foosball" ] + More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html + :param synonym_type: Synonym filter type can be passed. + Use "synonym" or "synonym_graph"; the latter handles multi-word synonyms correctly during the analysis process.
+ More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html """ # save init parameters to enable export of component config as YAML @@ -120,7 +129,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore): ca_certs=ca_certs, verify_certs=verify_certs, create_index=create_index, duplicate_documents=duplicate_documents, refresh_type=refresh_type, similarity=similarity, timeout=timeout, return_embedding=return_embedding, index_type=index_type, scroll=scroll, - skip_missing_embeddings=skip_missing_embeddings + skip_missing_embeddings=skip_missing_embeddings, synonyms=synonyms,synonym_type=synonym_type ) self.client = self._init_elastic_client(host=host, port=port, username=username, password=password, @@ -143,6 +152,8 @@ class ElasticsearchDocumentStore(BaseDocumentStore): self.return_embedding = return_embedding self.custom_mapping = custom_mapping + self.synonyms = synonyms + self.synonym_type = synonym_type self.index: str = index self.label_index: str = label_index self.scroll = scroll @@ -276,6 +287,13 @@ class ElasticsearchDocumentStore(BaseDocumentStore): } } } + if self.synonyms: + mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"} + mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace", + "filter": ["lowercase", + "synonym"]} + mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}} + if self.embedding_field: mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim} @@ -761,7 +779,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore): documents = [self._convert_es_hit_to_document(hit, return_embedding=self.return_embedding) for hit in result] return documents - + def query_by_embedding(self, query_emb: np.ndarray, filters: Optional[Dict[str, List[str]]] = None, @@ -1062,7 +1080,7 @@ class 
ElasticsearchDocumentStore(BaseDocumentStore): } ) query["query"]["bool"] = {"filter": filter_clause} - + if ids: query["query"]["bool"]["must"] = {"ids": {"values": ids}} @@ -1097,7 +1115,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore): In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using the KNN plugin that can scale to a large number of documents. """ - + def __init__(self, verify_certs=False, scheme="https", @@ -1191,7 +1209,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore): for hit in result ] return documents - + def _create_document_index(self, index_name: str): """ Create a new index for storing documents. diff --git a/test/test_document_store.py b/test/test_document_store.py index 4fa1c8e8a..95ea047ef 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -920,3 +920,18 @@ def test_skip_missing_embeddings(): document_store.skip_missing_embeddings = True with pytest.raises(RequestError): document_store.query_by_embedding(np.random.rand(768).astype(np.float32)) + + +@pytest.mark.elasticsearch +def test_elasticsearch_synonyms(): + synonyms = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"] + synonym_type = "synonym_graph" + + client = Elasticsearch() + client.indices.delete(index='haystack_synonym_arg', ignore=[404]) + document_store = ElasticsearchDocumentStore(index="haystack_synonym_arg", synonyms=synonyms, + synonym_type=synonym_type) + indexed_settings = client.indices.get_settings(index="haystack_synonym_arg") + + assert synonym_type == indexed_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']['type'] + assert synonyms == indexed_settings['haystack_synonym_arg']['settings']['index']['analysis']['filter']['synonym']['synonyms'] \ No newline at end of file