Fixed the Search Field mapping in ElasticSearch DocumentStore (#2080)

* Review changes

* Added the synonym analyser for search fields

* Addressed the review requests.

* Added synonyms to the OpenSearchDocumentStore and addressed the review requests.
This commit is contained in:
Sowmiya Jaganathan 2022-01-31 15:41:20 +05:30 committed by GitHub
parent bbb65a19bd
commit 7d769d8bf1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 3 deletions

View File

@ -281,7 +281,7 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
"mappings": {
"properties": {
self.name_field: {"type": "keyword"},
self.content_field: {"type": "text"},
self.content_field: {"type": "text"}
},
"dynamic_templates": [
{
@ -301,13 +301,21 @@ class ElasticsearchDocumentStore(KeywordDocumentStore):
}
}
}
if self.synonyms:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text", "analyzer": "synonym"}})
mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}
mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
"filter": ["lowercase",
"synonym"]}
mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}
else:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text"}})
if self.embedding_field:
mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim}
@ -1353,7 +1361,7 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
"mappings": {
"properties": {
self.name_field: {"type": "keyword"},
self.content_field: {"type": "text"},
self.content_field: {"type": "text"}
},
"dynamic_templates": [
{
@ -1373,6 +1381,21 @@ class OpenSearchDocumentStore(ElasticsearchDocumentStore):
}
}
}
if self.synonyms:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text", "analyzer": "synonym"}})
mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}
mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
"filter": ["lowercase",
"synonym"]}
mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}
else:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text"}})
if self.embedding_field:
if self.similarity == "cosine":

View File

@ -1196,4 +1196,30 @@ def test_DeepsetCloudDocumentStore_query_by_embedding(deepset_cloud_document_sto
responses.add_passthru(DC_API_ENDPOINT)
emb_docs = deepset_cloud_document_store.query_by_embedding(query_emb)
assert len(emb_docs) == 0
assert len(emb_docs) == 0
@pytest.mark.elasticsearch
def test_elasticsearch_search_field_mapping():
    """Verify that each configured search field ends up mapped as ``text``.

    The test recreates a dedicated index, writes a handful of documents through
    an ``ElasticsearchDocumentStore`` configured with two search fields, and
    then reads the index mapping back from the cluster to check the field types.
    """
    index_name = "haystack_search_field_mapping"

    # Drop any index left over from a previous run; ``ignore=[404]`` is passed
    # so that a missing index does not fail the test.
    es_client = Elasticsearch()
    es_client.indices.delete(index=index_name, ignore=[404])

    documents = [
        {
            "title": "Green tea components",
            "meta": {
                "content": "The green tea plant contains a range of healthy compounds that make it into the final drink",
                "sub_content": "Drink tip",
            },
            "id": "1",
        },
        {
            "title": "Green tea catechin",
            "meta": {
                "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
                "sub_content": "Ingredients tip",
            },
            "id": "2",
        },
        {
            "title": "Minerals in Green tea",
            "meta": {
                "content": "Green tea also has small amounts of minerals that can benefit your health.",
                "sub_content": "Minerals tip",
            },
            "id": "3",
        },
        {
            "title": "Green tea Benefits",
            "meta": {
                "content": "Green tea does more than just keep you alert, it may also help boost brain function.",
                "sub_content": "Health tip",
            },
            "id": "4",
        },
    ]

    # "title" is used as the content field, so "content" and "sub_content"
    # are only reachable through the search-field mapping under test.
    store = ElasticsearchDocumentStore(
        index=index_name,
        search_fields=["content", "sub_content"],
        content_field="title",
    )
    store.write_documents(documents)

    # Read the mapping back from the cluster rather than from the store object.
    mapping_response = es_client.indices.get_mapping(index=index_name)
    for field_name in ("content", "sub_content"):
        assert mapping_response[index_name]["mappings"]["properties"][field_name]["type"] == "text"