mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-25 06:48:43 +00:00 
			
		
		
		
	fix: Add separate query method for OpenSearchDocumentStore (#4764)
* Add separate query method for OpenSearchDocumentStore * Convert integration test to unit test + add separate tests for OpenSearch
This commit is contained in:
		
							parent
							
								
									41b6e33f64
								
							
						
					
					
						commit
						c7a20d68d2
					
				| @ -618,6 +618,182 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore): | ||||
|             query_embs, filters, top_k, index, return_embedding, headers, scale_score | ||||
|         ) | ||||
| 
 | ||||
|     def query( | ||||
|         self, | ||||
|         query: Optional[str], | ||||
|         filters: Optional[FilterType] = None, | ||||
|         top_k: int = 10, | ||||
|         custom_query: Optional[str] = None, | ||||
|         index: Optional[str] = None, | ||||
|         headers: Optional[Dict[str, str]] = None, | ||||
|         all_terms_must_match: bool = False, | ||||
|         scale_score: bool = True, | ||||
|     ) -> List[Document]: | ||||
|         """ | ||||
|         Scan through documents in DocumentStore and return a small number of documents | ||||
|         that are most relevant to the query as defined by the BM25 algorithm. | ||||
| 
 | ||||
|         :param query: The query | ||||
|         :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain | ||||
|                         conditions. | ||||
|                         Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical | ||||
|                         operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, | ||||
|                         `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. | ||||
|                         Logical operator keys take a dictionary of metadata field names and/or logical operators as | ||||
|                         value. Metadata field names take a dictionary of comparison operators as value. Comparison | ||||
|                         operator keys take a single value or (in case of `"$in"`) a list of values as value. | ||||
|                         If no logical operator is provided, `"$and"` is used as default operation. If no comparison | ||||
|                         operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default | ||||
|                         operation. | ||||
| 
 | ||||
|                             __Example__: | ||||
| 
 | ||||
|                             ```python | ||||
|                             filters = { | ||||
|                                 "$and": { | ||||
|                                     "type": {"$eq": "article"}, | ||||
|                                     "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, | ||||
|                                     "rating": {"$gte": 3}, | ||||
|                                     "$or": { | ||||
|                                         "genre": {"$in": ["economy", "politics"]}, | ||||
|                                         "publisher": {"$eq": "nytimes"} | ||||
|                                     } | ||||
|                                 } | ||||
|                             } | ||||
|                             # or simpler using default operators | ||||
|                             filters = { | ||||
|                                 "type": "article", | ||||
|                                 "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, | ||||
|                                 "rating": {"$gte": 3}, | ||||
|                                 "$or": { | ||||
|                                     "genre": ["economy", "politics"], | ||||
|                                     "publisher": "nytimes" | ||||
|                                 } | ||||
|                             } | ||||
|                             ``` | ||||
| 
 | ||||
|                             To use the same logical operator multiple times on the same level, logical operators take | ||||
|                             optionally a list of dictionaries as value. | ||||
| 
 | ||||
|                             __Example__: | ||||
| 
 | ||||
|                             ```python | ||||
|                             filters = { | ||||
|                                 "$or": [ | ||||
|                                     { | ||||
|                                         "$and": { | ||||
|                                             "Type": "News Paper", | ||||
|                                             "Date": { | ||||
|                                                 "$lt": "2019-01-01" | ||||
|                                             } | ||||
|                                         } | ||||
|                                     }, | ||||
|                                     { | ||||
|                                         "$and": { | ||||
|                                             "Type": "Blog Post", | ||||
|                                             "Date": { | ||||
|                                                 "$gte": "2019-01-01" | ||||
|                                             } | ||||
|                                         } | ||||
|                                     } | ||||
|                                 ] | ||||
|                             } | ||||
|                             ``` | ||||
|         :param top_k: How many documents to return per query. | ||||
|         :param custom_query: query string containing a mandatory `${query}` placeholder. | ||||
| 
 | ||||
|                              Optionally, an ES `filter` clause can be added where the values of `terms` are placeholders | ||||
|                              that get substituted at runtime. The placeholder names (${filter_name_1}, ${filter_name_2}, ...) | ||||
|                              must match the keys of the filters dict supplied in self.retrieve(). | ||||
|                              :: | ||||
| 
 | ||||
|                                  **An example custom_query:** | ||||
|                                 ```python | ||||
|                                 { | ||||
|                                     "size": 10, | ||||
|                                     "query": { | ||||
|                                         "bool": { | ||||
|                                             "should": [{"multi_match": { | ||||
|                                                 "query": ${query},                 // mandatory query placeholder | ||||
|                                                 "type": "most_fields", | ||||
|                                                 "fields": ["content", "title"]}}], | ||||
|                                             "filter": [                                 // optional custom filters | ||||
|                                                 {"terms": {"year": ${years}}}, | ||||
|                                                 {"terms": {"quarter": ${quarters}}}, | ||||
|                                                 {"range": {"date": {"gte": ${date}}}} | ||||
|                                                 ], | ||||
|                                         } | ||||
|                                     }, | ||||
|                                 } | ||||
|                                  ``` | ||||
| 
 | ||||
|                                 **For this custom_query, a sample retrieve() could be:** | ||||
|                                 ```python | ||||
|                                 self.retrieve(query="Why did the revenue increase?", | ||||
|                                               filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) | ||||
|                                 ``` | ||||
| 
 | ||||
|                              Optionally, highlighting can be defined by specifying the highlight settings. | ||||
|                              See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. | ||||
|                              You will find the highlighted output in the returned Document's meta field by key "highlighted". | ||||
|                              :: | ||||
| 
 | ||||
|                                  **Example custom_query with highlighting:** | ||||
|                                 ```python | ||||
|                                 { | ||||
|                                     "size": 10, | ||||
|                                     "query": { | ||||
|                                         "bool": { | ||||
|                                             "should": [{"multi_match": { | ||||
|                                                 "query": ${query},                 // mandatory query placeholder | ||||
|                                                 "type": "most_fields", | ||||
|                                                 "fields": ["content", "title"]}}], | ||||
|                                         } | ||||
|                                     }, | ||||
|                                     "highlight": {             // enable highlighting | ||||
|                                         "fields": {            // for fields content and title | ||||
|                                             "content": {}, | ||||
|                                             "title": {} | ||||
|                                         } | ||||
|                                     }, | ||||
|                                 } | ||||
|                                  ``` | ||||
| 
 | ||||
|                                  **For this custom_query, highlighting info can be accessed by:** | ||||
|                                 ```python | ||||
|                                 docs = self.retrieve(query="Why did the revenue increase?") | ||||
|                                 highlighted_content = docs[0].meta["highlighted"]["content"] | ||||
|                                 highlighted_title = docs[0].meta["highlighted"]["title"] | ||||
|                                 ``` | ||||
| 
 | ||||
|         :param index: The name of the index in the DocumentStore from which to retrieve documents | ||||
|         :param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) | ||||
|                 Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. | ||||
|         :param all_terms_must_match: Whether all terms of the query must match the document. | ||||
|                                      If true, all query terms must be present in a document in order to be retrieved (i.e. the AND operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). | ||||
|                                      Otherwise, at least one query term must be present in a document in order to be retrieved (i.e. the OR operator is used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). | ||||
|                                      Defaults to false. | ||||
|         :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]). | ||||
|                             If true (default), similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. | ||||
|                             Otherwise, raw similarity scores (e.g. cosine or dot_product) will be used. | ||||
|         """ | ||||
| 
 | ||||
|         if index is None: | ||||
|             index = self.index | ||||
| 
 | ||||
|         body = self._construct_query_body( | ||||
|             query=query, | ||||
|             filters=filters, | ||||
|             top_k=top_k, | ||||
|             custom_query=custom_query, | ||||
|             all_terms_must_match=all_terms_must_match, | ||||
|         ) | ||||
| 
 | ||||
|         result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"] | ||||
| 
 | ||||
|         documents = [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in result] | ||||
|         return documents | ||||
| 
 | ||||
|     def _construct_dense_query_body( | ||||
|         self, query_emb: np.ndarray, return_embedding: bool, filters: Optional[FilterType] = None, top_k: int = 10 | ||||
|     ): | ||||
|  | ||||
| @ -392,6 +392,40 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc | ||||
|         _, kwargs = mocked_open_search_init.call_args | ||||
|         assert "http_auth" not in kwargs | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test_query(self, mocked_document_store): | ||||
|         mocked_document_store.query(query=self.query) | ||||
|         kwargs = mocked_document_store.client.search.call_args.kwargs | ||||
|         assert "index" in kwargs | ||||
|         assert "body" in kwargs | ||||
|         assert "headers" in kwargs | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test_query_return_embedding_false(self, mocked_document_store): | ||||
|         mocked_document_store.return_embedding = False | ||||
|         mocked_document_store.query(self.query) | ||||
|         # assert the resulting body is consistent with the `return_embedding` value | ||||
|         _, kwargs = mocked_document_store.client.search.call_args | ||||
|         assert kwargs["body"]["_source"] == {"excludes": ["embedding"]} | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store): | ||||
|         mocked_document_store.return_embedding = True | ||||
|         mocked_document_store.excluded_meta_data = ["foo", "embedding"] | ||||
|         mocked_document_store.query(self.query) | ||||
|         _, kwargs = mocked_document_store.client.search.call_args | ||||
|         # we expect "embedding" was removed from the final query | ||||
|         assert kwargs["body"]["_source"] == {"excludes": ["foo"]} | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store): | ||||
|         mocked_document_store.return_embedding = False | ||||
|         mocked_document_store.excluded_meta_data = ["foo"] | ||||
|         mocked_document_store.query(self.query) | ||||
|         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||
|         _, kwargs = mocked_document_store.client.search.call_args | ||||
|         assert kwargs["body"]["_source"] == {"excludes": ["foo", "embedding"]} | ||||
| 
 | ||||
|     @pytest.mark.unit | ||||
|     def test_query_by_embedding_raises_if_missing_field(self, mocked_document_store): | ||||
|         mocked_document_store.embedding_field = "" | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 bogdankostic
						bogdankostic