mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-25 14:59:01 +00:00 
			
		
		
		
	Add DocumentStore for Open Distro Elasticsearch (#676)
This commit is contained in:
		
							parent
							
								
									33fe597949
								
							
						
					
					
						commit
						369e237fd4
					
				| @ -111,21 +111,18 @@ class ElasticsearchDocumentStore(BaseDocumentStore): | ||||
|         self.custom_mapping = custom_mapping | ||||
|         self.index: str = index | ||||
|         self.label_index: str = label_index | ||||
|         if similarity in ["cosine", "dot_product"]: | ||||
|             self.similarity = similarity | ||||
|         else: | ||||
|             raise Exception("Invalid value for similarity in ElasticSearchDocumentStore constructor. Choose between 'cosine' and 'dot_product'") | ||||
|         if create_index: | ||||
|             self._create_document_index(index) | ||||
|             self._create_label_index(label_index) | ||||
| 
 | ||||
|         self.update_existing_documents = update_existing_documents | ||||
|         self.refresh_type = refresh_type | ||||
|         self.similarity = similarity | ||||
|         if similarity == "cosine": | ||||
|             self.similarity_fn_name = "cosineSimilarity" | ||||
|         elif similarity == "dot_product": | ||||
|             self.similarity_fn_name = "dotProduct" | ||||
|         else: | ||||
|             raise Exception("Invalid value for similarity in ElasticSearchDocumentStore constructor. Choose between \'cosine\' and \'dot_product\'") | ||||
| 
 | ||||
|     def _create_document_index(self, index_name): | ||||
|     def _create_document_index(self, index_name: str): | ||||
|         """ | ||||
|         Create a new index for storing documents. In case if an index with the name already exists, it ensures that | ||||
|         the embedding_field is present. | ||||
| @ -182,7 +179,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore): | ||||
|             if not self.client.indices.exists(index=index_name): | ||||
|                 raise e | ||||
| 
 | ||||
|     def _create_label_index(self, index_name): | ||||
|     def _create_label_index(self, index_name: str): | ||||
|         if self.client.indices.exists(index=index_name): | ||||
|             return | ||||
|         mapping = { | ||||
| @ -531,22 +528,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore): | ||||
|             raise RuntimeError("Please specify arg `embedding_field` in ElasticsearchDocumentStore()") | ||||
|         else: | ||||
|             # +1 in similarity to avoid negative numbers (for cosine sim) | ||||
|             body= { | ||||
|             body = { | ||||
|                 "size": top_k, | ||||
|                 "query": { | ||||
|                     "script_score": { | ||||
|                         "query": {"match_all": {}}, | ||||
|                         "script": { | ||||
|                             # offset score to ensure a positive range as required by Elasticsearch | ||||
|                             "source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1000", | ||||
|                             "params": { | ||||
|                                 "query_vector": query_emb.tolist() | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             }  # type: Dict[str,Any] | ||||
| 
 | ||||
|                 "query": self._get_vector_similarity_query(query_emb, top_k) | ||||
|             } | ||||
|             if filters: | ||||
|                 for key, values in filters.items(): | ||||
|                     if type(values) != list: | ||||
| @ -580,6 +565,29 @@ class ElasticsearchDocumentStore(BaseDocumentStore): | ||||
|             ] | ||||
|             return documents | ||||
| 
 | ||||
|     def _get_vector_similarity_query(self, query_emb: np.array, top_k: int): | ||||
|         """ | ||||
|         Generate Elasticsearch query for vector similarity. | ||||
|         """ | ||||
|         if self.similarity == "cosine": | ||||
|             similarity_fn_name = "cosineSimilarity" | ||||
|         elif self.similarity == "dot_product": | ||||
|             similarity_fn_name = "dotProduct" | ||||
|         else: | ||||
|             raise Exception("Invalid value for similarity in ElasticSearchDocumentStore constructor. Choose between \'cosine\' and \'dot_product\'") | ||||
| 
 | ||||
|         query = { | ||||
|             "script_score": { | ||||
|                 "query": {"match_all": {}}, | ||||
|                 "script": { | ||||
|                     # offset score to ensure a positive range as required by Elasticsearch | ||||
|                     "source": f"{similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1000", | ||||
|                     "params": {"query_vector": query_emb.tolist()}, | ||||
|                 }, | ||||
|             } | ||||
|         } | ||||
|         return query | ||||
| 
 | ||||
|     def _convert_es_hit_to_document( | ||||
|             self, | ||||
|             hit: dict, | ||||
| @ -596,7 +604,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore): | ||||
|         score = hit["_score"] if hit["_score"] else None | ||||
|         if score: | ||||
|             if adapt_score_for_embedding: | ||||
|                 score -= 1000 | ||||
|                 score = self._scale_embedding_score(score) | ||||
|                 if self.similarity == "cosine": | ||||
|                     probability = (score + 1) / 2  # scaling probability from cosine similarity | ||||
|                 elif self.similarity == "dot_product": | ||||
| @ -623,6 +631,9 @@ class ElasticsearchDocumentStore(BaseDocumentStore): | ||||
|         ) | ||||
|         return document | ||||
| 
 | ||||
|     def _scale_embedding_score(self, score): | ||||
|         return score - 1000 | ||||
| 
 | ||||
|     def describe_documents(self, index=None): | ||||
|         """ | ||||
|         Return a summary of the documents in the document store | ||||
| @ -717,7 +728,78 @@ class ElasticsearchDocumentStore(BaseDocumentStore): | ||||
|         time.sleep(1) | ||||
| 
 | ||||
| 
 | ||||
| class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore): | ||||
|     """ | ||||
|     Document Store using the Open Distro for Elasticsearch. It is compatible with the AWS Elasticsearch Service. | ||||
| 
 | ||||
|     In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using | ||||
|     the KNN plugin that can scale to a large number of documents. | ||||
|     """ | ||||
| 
 | ||||
|     def _create_document_index(self, index_name: str): | ||||
|         """ | ||||
|         Create a new index for storing documents. | ||||
|         """ | ||||
| 
 | ||||
|         if self.custom_mapping: | ||||
|             mapping = self.custom_mapping | ||||
|         else: | ||||
|             mapping = { | ||||
|                 "mappings": { | ||||
|                     "properties": { | ||||
|                         self.name_field: {"type": "keyword"}, | ||||
|                         self.text_field: {"type": "text"}, | ||||
|                     }, | ||||
|                     "dynamic_templates": [ | ||||
|                         { | ||||
|                             "strings": { | ||||
|                                 "path_match": "*", | ||||
|                                 "match_mapping_type": "string", | ||||
|                                 "mapping": {"type": "keyword"}}} | ||||
|                     ], | ||||
|                 }, | ||||
|                 "settings": { | ||||
|                     "analysis": { | ||||
|                         "analyzer": { | ||||
|                             "default": { | ||||
|                                 "type": self.analyzer, | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             if self.embedding_field: | ||||
|                 if self.similarity == "cosine": | ||||
|                     similarity_space_type = "cosinesimil" | ||||
|                 elif self.similarity == "dot_product": | ||||
|                     similarity_space_type = "l2" | ||||
|                 else: | ||||
|                     raise Exception( | ||||
|                         f"Similarity function {self.similarity} is not supported by OpenDistroElasticsearchDocumentStore." | ||||
|                     ) | ||||
|                 mapping["settings"]["knn"] = True | ||||
|                 mapping["settings"]["knn.space_type"] = similarity_space_type | ||||
|                 mapping["mappings"]["properties"][self.embedding_field] = { | ||||
|                     "type": "knn_vector", | ||||
|                     "dimension": self.embedding_dim, | ||||
|                 } | ||||
| 
 | ||||
|         try: | ||||
|             self.client.indices.create(index=index_name, body=mapping) | ||||
|         except RequestError as e: | ||||
|             # With multiple workers we need to avoid race conditions, where: | ||||
|             # - there's no index in the beginning | ||||
|             # - both want to create one | ||||
|             # - one fails as the other one already created it | ||||
|             if not self.client.indices.exists(index=index_name): | ||||
|                 raise e | ||||
| 
 | ||||
|     def _get_vector_similarity_query(self, query_emb: np.array, top_k: int): | ||||
|         """ | ||||
|         Generate Elasticsearch query for vector similarity. | ||||
|         """ | ||||
|         query = {"knn": {self.embedding_field: {"vector": query_emb.tolist(), "k": top_k}}} | ||||
|         return query | ||||
| 
 | ||||
|     def _scale_embedding_score(self, score): | ||||
|         return score | ||||
| @ -179,7 +179,8 @@ class BaseRetriever(ABC): | ||||
|             documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever) | ||||
|         else: | ||||
|             documents = self.retrieve(query=query, filters=filters) | ||||
| 
 | ||||
|         document_ids = [doc.id for doc in documents] | ||||
|         logger.debug(f"Retrieved documents with IDs: {document_ids}") | ||||
|         output = { | ||||
|             "query": query, | ||||
|             "documents": documents, | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Tanay Soni
						Tanay Soni