mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-24 22:38:41 +00:00 
			
		
		
		
	fix: remove warnings from the more recent Elasticsearch client (#4602)
* clean up the ES instance in a more robust way * do not sleep, refresh the index instead * remove client warnings * fix unit tests * fix opensearch compatibility * fix unit tests * update ES version * bump elasticsearch-py * adjust docs * use recreate_index param * use same fixture strategy for Opensearch * Update lg --------- Co-authored-by: agnieszka-m <amarzec13@gmail.com>
This commit is contained in:
		
							parent
							
								
									8c4176bdb2
								
							
						
					
					
						commit
						0c081f19e2
					
				
							
								
								
									
										2
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							| @ -1231,7 +1231,7 @@ jobs: | |||||||
| 
 | 
 | ||||||
|     - name: Run Elasticsearch |     - name: Run Elasticsearch | ||||||
|       run: | |       run: | | ||||||
|         docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2 |         docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.17.6 | ||||||
| 
 | 
 | ||||||
|     - name: Run Opensearch |     - name: Run Opensearch | ||||||
|       run: | |       run: | | ||||||
|  | |||||||
| @ -373,7 +373,7 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore): | |||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             result = self.client.search(index=index, body=body, request_timeout=300, headers=headers)["hits"]["hits"] |             result = self.client.search(index=index, **body, request_timeout=300, headers=headers)["hits"]["hits"] | ||||||
|             if len(result) == 0: |             if len(result) == 0: | ||||||
|                 count_documents = self.get_document_count(index=index, headers=headers) |                 count_documents = self.get_document_count(index=index, headers=headers) | ||||||
|                 if count_documents == 0: |                 if count_documents == 0: | ||||||
| @ -454,7 +454,7 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore): | |||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|         try: |         try: | ||||||
|             self.client.indices.create(index=index_name, body=mapping, headers=headers) |             self.client.indices.create(index=index_name, **mapping, headers=headers) | ||||||
|         except self._RequestError as e: |         except self._RequestError as e: | ||||||
|             # With multiple workers we need to avoid race conditions, where: |             # With multiple workers we need to avoid race conditions, where: | ||||||
|             # - there's no index in the beginning |             # - there's no index in the beginning | ||||||
| @ -483,7 +483,7 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore): | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         try: |         try: | ||||||
|             self.client.indices.create(index=index_name, body=mapping, headers=headers) |             self.client.indices.create(index=index_name, **mapping, headers=headers) | ||||||
|         except self._RequestError as e: |         except self._RequestError as e: | ||||||
|             # With multiple workers we need to avoid race conditions, where: |             # With multiple workers we need to avoid race conditions, where: | ||||||
|             # - there's no index in the beginning |             # - there's no index in the beginning | ||||||
| @ -496,7 +496,7 @@ class ElasticsearchDocumentStore(SearchEngineDocumentStore): | |||||||
|         """ |         """ | ||||||
|         Validates an existing document index. If there's no embedding field, we'll add it. |         Validates an existing document index. If there's no embedding field, we'll add it. | ||||||
|         """ |         """ | ||||||
|         indices = self.client.indices.get(index_name, headers=headers) |         indices = self.client.indices.get(index=index_name, headers=headers) | ||||||
| 
 | 
 | ||||||
|         if not any(indices): |         if not any(indices): | ||||||
|             logger.warning( |             logger.warning( | ||||||
|  | |||||||
| @ -1339,3 +1339,123 @@ class OpenSearchDocumentStore(SearchEngineDocumentStore): | |||||||
|                     progress_bar.update(batch_size) |                     progress_bar.update(batch_size) | ||||||
|         finally: |         finally: | ||||||
|             opensearch_logger.setLevel(original_log_level) |             opensearch_logger.setLevel(original_log_level) | ||||||
|  | 
 | ||||||
|  |     def get_metadata_values_by_key( | ||||||
|  |         self, | ||||||
|  |         key: str, | ||||||
|  |         query: Optional[str] = None, | ||||||
|  |         filters: Optional[FilterType] = None, | ||||||
|  |         index: Optional[str] = None, | ||||||
|  |         headers: Optional[Dict[str, str]] = None, | ||||||
|  |     ) -> List[dict]: | ||||||
|  |         """ | ||||||
|  |         Get values associated with a metadata key. The output is in the format: | ||||||
|  |             [{"value": "my-value-1", "count": 23}, {"value": "my-value-2", "count": 12}, ... ] | ||||||
|  | 
 | ||||||
|  |         :param key: The meta key name to get the values for. | ||||||
|  |         :param query: Narrow down the scope to documents matching the query string. | ||||||
|  |         :param filters: Narrow down the scope to documents matching the given filters. | ||||||
|  |                         Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical | ||||||
|  |                         operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, | ||||||
|  |                         `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. | ||||||
|  |                         Logical operator keys take a dictionary of metadata field names and/or logical operators as | ||||||
|  |                         value. Metadata field names take a dictionary of comparison operators as value. Comparison | ||||||
|  |                         operator keys take a single value or (in case of `"$in"`) a list of values as value. | ||||||
|  |                         If no logical operator is provided, `"$and"` is used as default operation. If no comparison | ||||||
|  |                         operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default | ||||||
|  |                         operation. | ||||||
|  | 
 | ||||||
|  |                             __Example__: | ||||||
|  | 
 | ||||||
|  |                             ```python | ||||||
|  |                             filters = { | ||||||
|  |                                 "$and": { | ||||||
|  |                                     "type": {"$eq": "article"}, | ||||||
|  |                                     "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, | ||||||
|  |                                     "rating": {"$gte": 3}, | ||||||
|  |                                     "$or": { | ||||||
|  |                                         "genre": {"$in": ["economy", "politics"]}, | ||||||
|  |                                         "publisher": {"$eq": "nytimes"} | ||||||
|  |                                     } | ||||||
|  |                                 } | ||||||
|  |                             } | ||||||
|  |                             ``` | ||||||
|  |         :param index: The search index to search for the meta values. If not supplied, | ||||||
|  |                       self.index is used. | ||||||
|  |         :param headers: Custom HTTP headers to pass to the client (for example, {'Authorization': 'Basic YWRtaW46cm9vdA=='}) | ||||||
|  |                 Check out [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html) for more information. | ||||||
|  |         """ | ||||||
|  |         body: dict = { | ||||||
|  |             "size": 0, | ||||||
|  |             "aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}}, | ||||||
|  |         } | ||||||
|  |         if query: | ||||||
|  |             body["query"] = { | ||||||
|  |                 "bool": { | ||||||
|  |                     "should": [{"multi_match": {"query": query, "type": "most_fields", "fields": self.search_fields}}] | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         if filters: | ||||||
|  |             if not body.get("query"): | ||||||
|  |                 body["query"] = {"bool": {}} | ||||||
|  |             body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()}) | ||||||
|  |         result = self.client.search(body=body, index=index, headers=headers) | ||||||
|  | 
 | ||||||
|  |         values = [] | ||||||
|  |         current_buckets = result["aggregations"]["metadata_agg"]["buckets"] | ||||||
|  |         after_key = result["aggregations"]["metadata_agg"].get("after_key", False) | ||||||
|  |         for bucket in current_buckets: | ||||||
|  |             values.append({"value": bucket["key"][key], "count": bucket["doc_count"]}) | ||||||
|  | 
 | ||||||
|  |         # Only 10 results get returned at a time, so apply pagination | ||||||
|  |         while after_key: | ||||||
|  |             body["aggs"]["metadata_agg"]["composite"]["after"] = after_key | ||||||
|  |             result = self.client.search(body=body, index=index, headers=headers) | ||||||
|  |             current_buckets = result["aggregations"]["metadata_agg"]["buckets"] | ||||||
|  |             after_key = result["aggregations"]["metadata_agg"].get("after_key", False) | ||||||
|  |             for bucket in current_buckets: | ||||||
|  |                 values.append({"value": bucket["key"][key], "count": bucket["doc_count"]}) | ||||||
|  | 
 | ||||||
|  |         return values | ||||||
|  | 
 | ||||||
|  |     def get_documents_by_id( | ||||||
|  |         self, | ||||||
|  |         ids: List[str], | ||||||
|  |         index: Optional[str] = None, | ||||||
|  |         batch_size: int = 10_000, | ||||||
|  |         headers: Optional[Dict[str, str]] = None, | ||||||
|  |     ) -> List[Document]: | ||||||
|  |         """ | ||||||
|  |         Fetch documents by specifying a list of text ID strings. | ||||||
|  | 
 | ||||||
|  |         :param ids: List of document IDs. Be aware that passing a large number of IDs might lead to performance issues. | ||||||
|  |         :param index: The search index where the documents are stored. If not supplied, | ||||||
|  |                       self.index is used. | ||||||
|  |         :param batch_size: Maximum number of results for each query. | ||||||
|  |                            Limited to 10,000 documents by default. | ||||||
|  |                            To reduce the pressure on the cluster, you can lower this limit at the expense | ||||||
|  |                            of longer retrieval times. | ||||||
|  |         :param headers: Custom HTTP headers to pass to the client (for example, {'Authorization': 'Basic YWRtaW46cm9vdA=='}) | ||||||
|  |                         Check out [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html) for more information. | ||||||
|  |         """ | ||||||
|  |         index = index or self.index | ||||||
|  |         documents = [] | ||||||
|  |         for i in range(0, len(ids), batch_size): | ||||||
|  |             ids_for_batch = ids[i : i + batch_size] | ||||||
|  |             query = {"size": len(ids_for_batch), "query": {"ids": {"values": ids_for_batch}}} | ||||||
|  |             if not self.return_embedding and self.embedding_field: | ||||||
|  |                 query["_source"] = {"excludes": [self.embedding_field]} | ||||||
|  |             result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"] | ||||||
|  |             documents.extend([self._convert_es_hit_to_document(hit) for hit in result]) | ||||||
|  |         return documents | ||||||
|  | 
 | ||||||
|  |     def update_document_meta( | ||||||
|  |         self, id: str, meta: Dict[str, str], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None | ||||||
|  |     ): | ||||||
|  |         """ | ||||||
|  |         Update the metadata dictionary of a document by specifying its ID string. | ||||||
|  |         """ | ||||||
|  |         if not index: | ||||||
|  |             index = self.index | ||||||
|  |         body = {"doc": meta} | ||||||
|  |         self.client.update(index=index, id=id, body=body, refresh=self.refresh_type, headers=headers) | ||||||
|  | |||||||
| @ -166,7 +166,7 @@ class SearchEngineDocumentStore(KeywordDocumentStore): | |||||||
|             if self.client.indices.exists_alias(name=index_name): |             if self.client.indices.exists_alias(name=index_name): | ||||||
|                 logger.debug("Index name %s is an alias.", index_name) |                 logger.debug("Index name %s is an alias.", index_name) | ||||||
| 
 | 
 | ||||||
|         return self.client.indices.exists(index_name, headers=headers) |         return self.client.indices.exists(index=index_name, headers=headers) | ||||||
| 
 | 
 | ||||||
|     @abstractmethod |     @abstractmethod | ||||||
|     def _validate_and_adjust_document_index(self, index_name: str, headers: Optional[Dict[str, str]] = None): |     def _validate_and_adjust_document_index(self, index_name: str, headers: Optional[Dict[str, str]] = None): | ||||||
| @ -281,7 +281,7 @@ class SearchEngineDocumentStore(KeywordDocumentStore): | |||||||
|             query = {"size": len(ids_for_batch), "query": {"ids": {"values": ids_for_batch}}} |             query = {"size": len(ids_for_batch), "query": {"ids": {"values": ids_for_batch}}} | ||||||
|             if not self.return_embedding and self.embedding_field: |             if not self.return_embedding and self.embedding_field: | ||||||
|                 query["_source"] = {"excludes": [self.embedding_field]} |                 query["_source"] = {"excludes": [self.embedding_field]} | ||||||
|             result = self.client.search(index=index, body=query, headers=headers)["hits"]["hits"] |             result = self.client.search(index=index, **query, headers=headers)["hits"]["hits"] | ||||||
|             documents.extend([self._convert_es_hit_to_document(hit) for hit in result]) |             documents.extend([self._convert_es_hit_to_document(hit) for hit in result]) | ||||||
|         return documents |         return documents | ||||||
| 
 | 
 | ||||||
| @ -344,7 +344,7 @@ class SearchEngineDocumentStore(KeywordDocumentStore): | |||||||
|             if not body.get("query"): |             if not body.get("query"): | ||||||
|                 body["query"] = {"bool": {}} |                 body["query"] = {"bool": {}} | ||||||
|             body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()}) |             body["query"]["bool"].update({"filter": LogicalFilterClause.parse(filters).convert_to_elasticsearch()}) | ||||||
|         result = self.client.search(body=body, index=index, headers=headers) |         result = self.client.search(**body, index=index, headers=headers) | ||||||
| 
 | 
 | ||||||
|         values = [] |         values = [] | ||||||
|         current_buckets = result["aggregations"]["metadata_agg"]["buckets"] |         current_buckets = result["aggregations"]["metadata_agg"]["buckets"] | ||||||
| @ -355,7 +355,7 @@ class SearchEngineDocumentStore(KeywordDocumentStore): | |||||||
|         # Only 10 results get returned at a time, so apply pagination |         # Only 10 results get returned at a time, so apply pagination | ||||||
|         while after_key: |         while after_key: | ||||||
|             body["aggs"]["metadata_agg"]["composite"]["after"] = after_key |             body["aggs"]["metadata_agg"]["composite"]["after"] = after_key | ||||||
|             result = self.client.search(body=body, index=index, headers=headers) |             result = self.client.search(**body, index=index, headers=headers) | ||||||
|             current_buckets = result["aggregations"]["metadata_agg"]["buckets"] |             current_buckets = result["aggregations"]["metadata_agg"]["buckets"] | ||||||
|             after_key = result["aggregations"]["metadata_agg"].get("after_key", False) |             after_key = result["aggregations"]["metadata_agg"].get("after_key", False) | ||||||
|             for bucket in current_buckets: |             for bucket in current_buckets: | ||||||
| @ -521,7 +521,7 @@ class SearchEngineDocumentStore(KeywordDocumentStore): | |||||||
|         if not index: |         if not index: | ||||||
|             index = self.index |             index = self.index | ||||||
|         body = {"doc": meta} |         body = {"doc": meta} | ||||||
|         self.client.update(index=index, id=id, body=body, refresh=self.refresh_type, headers=headers) |         self.client.update(index=index, id=id, **body, refresh=self.refresh_type, headers=headers) | ||||||
| 
 | 
 | ||||||
|     def get_document_count( |     def get_document_count( | ||||||
|         self, |         self, | ||||||
| @ -908,7 +908,7 @@ class SearchEngineDocumentStore(KeywordDocumentStore): | |||||||
|             all_terms_must_match=all_terms_must_match, |             all_terms_must_match=all_terms_must_match, | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         result = self.client.search(index=index, body=body, headers=headers)["hits"]["hits"] |         result = self.client.search(index=index, **body, headers=headers)["hits"]["hits"] | ||||||
| 
 | 
 | ||||||
|         documents = [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in result] |         documents = [self._convert_es_hit_to_document(hit, scale_score=scale_score) for hit in result] | ||||||
|         return documents |         return documents | ||||||
| @ -1542,7 +1542,7 @@ class SearchEngineDocumentStore(KeywordDocumentStore): | |||||||
|         self.client.delete_by_query(index=index, body=query, ignore=[404], headers=headers) |         self.client.delete_by_query(index=index, body=query, ignore=[404], headers=headers) | ||||||
|         # We want to be sure that all docs are deleted before continuing (delete_by_query doesn't support wait_for) |         # We want to be sure that all docs are deleted before continuing (delete_by_query doesn't support wait_for) | ||||||
|         if self.refresh_type == "wait_for": |         if self.refresh_type == "wait_for": | ||||||
|             time.sleep(2) |             self.client.indices.refresh(index=index) | ||||||
| 
 | 
 | ||||||
|     def delete_labels( |     def delete_labels( | ||||||
|         self, |         self, | ||||||
|  | |||||||
| @ -75,7 +75,7 @@ dependencies = [ | |||||||
|   "sentence-transformers>=2.2.0", |   "sentence-transformers>=2.2.0", | ||||||
| 
 | 
 | ||||||
|   # Elasticsearch |   # Elasticsearch | ||||||
|   "elasticsearch>=7.7,<8", |   "elasticsearch>=7.17,<8", | ||||||
| 
 | 
 | ||||||
|   # OpenAI tokenizer |   # OpenAI tokenizer | ||||||
|   "tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))", |   "tiktoken>=0.3.0; python_version >= '3.8' and (platform_machine == 'AMD64' or platform_machine == 'amd64' or platform_machine == 'x86_64' or (platform_machine == 'arm64' and platform_system == 'Darwin'))", | ||||||
|  | |||||||
| @ -22,7 +22,8 @@ class TestElasticsearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngine | |||||||
|     @pytest.fixture |     @pytest.fixture | ||||||
|     def ds(self): |     def ds(self): | ||||||
|         """ |         """ | ||||||
|         This fixture provides a working document store and takes care of removing the indices when done |         This fixture provides a working document store and takes care of keeping clean | ||||||
|  |         the ES cluster used in the tests. | ||||||
|         """ |         """ | ||||||
|         labels_index_name = f"{self.index_name}_labels" |         labels_index_name = f"{self.index_name}_labels" | ||||||
|         ds = ElasticsearchDocumentStore( |         ds = ElasticsearchDocumentStore( | ||||||
| @ -30,10 +31,10 @@ class TestElasticsearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngine | |||||||
|             label_index=labels_index_name, |             label_index=labels_index_name, | ||||||
|             host=os.environ.get("ELASTICSEARCH_HOST", "localhost"), |             host=os.environ.get("ELASTICSEARCH_HOST", "localhost"), | ||||||
|             create_index=True, |             create_index=True, | ||||||
|  |             recreate_index=True, | ||||||
|         ) |         ) | ||||||
|  | 
 | ||||||
|         yield ds |         yield ds | ||||||
|         ds.delete_index(self.index_name) |  | ||||||
|         ds.delete_index(labels_index_name) |  | ||||||
| 
 | 
 | ||||||
|     @pytest.fixture |     @pytest.fixture | ||||||
|     def mocked_elastic_search_init(self, monkeypatch): |     def mocked_elastic_search_init(self, monkeypatch): | ||||||
| @ -213,8 +214,8 @@ class TestElasticsearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngine | |||||||
| 
 | 
 | ||||||
|         settings = {"mappings": {"properties": {"content": {"type": "text"}}}} |         settings = {"mappings": {"properties": {"content": {"type": "text"}}}} | ||||||
| 
 | 
 | ||||||
|         client.indices.create(index="haystack_existing_alias_1", body=settings) |         client.indices.create(index="haystack_existing_alias_1", **settings) | ||||||
|         client.indices.create(index="haystack_existing_alias_2", body=settings) |         client.indices.create(index="haystack_existing_alias_2", **settings) | ||||||
| 
 | 
 | ||||||
|         client.indices.put_alias( |         client.indices.put_alias( | ||||||
|             index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" |             index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" | ||||||
| @ -233,8 +234,8 @@ class TestElasticsearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngine | |||||||
|         right_settings = {"mappings": {"properties": {"content": {"type": "text"}}}} |         right_settings = {"mappings": {"properties": {"content": {"type": "text"}}}} | ||||||
|         wrong_settings = {"mappings": {"properties": {"content": {"type": "histogram"}}}} |         wrong_settings = {"mappings": {"properties": {"content": {"type": "histogram"}}}} | ||||||
| 
 | 
 | ||||||
|         client.indices.create(index="haystack_existing_alias_1", body=right_settings) |         client.indices.create(index="haystack_existing_alias_1", **right_settings) | ||||||
|         client.indices.create(index="haystack_existing_alias_2", body=wrong_settings) |         client.indices.create(index="haystack_existing_alias_2", **wrong_settings) | ||||||
|         client.indices.put_alias( |         client.indices.put_alias( | ||||||
|             index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" |             index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" | ||||||
|         ) |         ) | ||||||
| @ -326,3 +327,20 @@ class TestElasticsearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngine | |||||||
|                 username="", aws4auth="foo", **_init_client_remaining_kwargs |                 username="", aws4auth="foo", **_init_client_remaining_kwargs | ||||||
|             ) |             ) | ||||||
|         assert len(caplog.records) == 0 |         assert len(caplog.records) == 0 | ||||||
|  | 
 | ||||||
|  |     @pytest.mark.unit | ||||||
|  |     def test_get_document_by_id_return_embedding_false(self, mocked_document_store): | ||||||
|  |         mocked_document_store.return_embedding = False | ||||||
|  |         mocked_document_store.get_document_by_id("123") | ||||||
|  |         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||||
|  |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|  |         assert kwargs["_source"] == {"excludes": ["embedding"]} | ||||||
|  | 
 | ||||||
|  |     @pytest.mark.unit | ||||||
|  |     def test_get_document_by_id_excluded_meta_data_has_no_influence(self, mocked_document_store): | ||||||
|  |         mocked_document_store.excluded_meta_data = ["foo"] | ||||||
|  |         mocked_document_store.return_embedding = False | ||||||
|  |         mocked_document_store.get_document_by_id("123") | ||||||
|  |         # assert the resulting body is not affected by the `excluded_meta_data` value | ||||||
|  |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|  |         assert kwargs["_source"] == {"excludes": ["embedding"]} | ||||||
|  | |||||||
| @ -32,7 +32,8 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc | |||||||
|     @pytest.fixture |     @pytest.fixture | ||||||
|     def ds(self): |     def ds(self): | ||||||
|         """ |         """ | ||||||
|         This fixture provides a working document store and takes care of removing the indices when done |         This fixture provides a working document store and takes care of keeping clean the | ||||||
|  |         OS cluster used in the tests. | ||||||
|         """ |         """ | ||||||
|         labels_index_name = f"{self.index_name}_labels" |         labels_index_name = f"{self.index_name}_labels" | ||||||
|         ds = OpenSearchDocumentStore( |         ds = OpenSearchDocumentStore( | ||||||
| @ -40,10 +41,10 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc | |||||||
|             label_index=labels_index_name, |             label_index=labels_index_name, | ||||||
|             host=os.environ.get("OPENSEARCH_HOST", "localhost"), |             host=os.environ.get("OPENSEARCH_HOST", "localhost"), | ||||||
|             create_index=True, |             create_index=True, | ||||||
|  |             recreate_index=True, | ||||||
|         ) |         ) | ||||||
|  | 
 | ||||||
|         yield ds |         yield ds | ||||||
|         ds.delete_index(self.index_name) |  | ||||||
|         ds.delete_index(labels_index_name) |  | ||||||
| 
 | 
 | ||||||
|     @pytest.fixture |     @pytest.fixture | ||||||
|     def mocked_document_store(self, existing_index): |     def mocked_document_store(self, existing_index): | ||||||
| @ -1239,3 +1240,20 @@ class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDoc | |||||||
|             ] |             ] | ||||||
|             mocked_document_store._bulk(documents=docs_to_write, _timeout=0, _remaining_tries=3) |             mocked_document_store._bulk(documents=docs_to_write, _timeout=0, _remaining_tries=3) | ||||||
|             assert mocked_bulk.call_count == 5 |             assert mocked_bulk.call_count == 5 | ||||||
|  | 
 | ||||||
|  |     @pytest.mark.unit | ||||||
|  |     def test_get_document_by_id_return_embedding_false(self, mocked_document_store): | ||||||
|  |         mocked_document_store.return_embedding = False | ||||||
|  |         mocked_document_store.get_document_by_id("123") | ||||||
|  |         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||||
|  |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|  |         assert kwargs["body"]["_source"] == {"excludes": ["embedding"]} | ||||||
|  | 
 | ||||||
|  |     @pytest.mark.unit | ||||||
|  |     def test_get_document_by_id_excluded_meta_data_has_no_influence(self, mocked_document_store): | ||||||
|  |         mocked_document_store.excluded_meta_data = ["foo"] | ||||||
|  |         mocked_document_store.return_embedding = False | ||||||
|  |         mocked_document_store.get_document_by_id("123") | ||||||
|  |         # assert the resulting body is not affected by the `excluded_meta_data` value | ||||||
|  |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|  |         assert kwargs["body"]["_source"] == {"excludes": ["embedding"]} | ||||||
|  | |||||||
| @ -62,7 +62,7 @@ class SearchEngineDocumentStoreTestAbstract: | |||||||
|         mocked_document_store.query(self.query) |         mocked_document_store.query(self.query) | ||||||
|         # assert the resulting body is consistent with the `excluded_meta_data` value |         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|         assert "_source" not in kwargs["body"] |         assert "_source" not in kwargs | ||||||
| 
 | 
 | ||||||
|     @pytest.mark.unit |     @pytest.mark.unit | ||||||
|     def test_query_return_embedding_false(self, mocked_document_store): |     def test_query_return_embedding_false(self, mocked_document_store): | ||||||
| @ -70,7 +70,7 @@ class SearchEngineDocumentStoreTestAbstract: | |||||||
|         mocked_document_store.query(self.query) |         mocked_document_store.query(self.query) | ||||||
|         # assert the resulting body is consistent with the `excluded_meta_data` value |         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|         assert kwargs["body"]["_source"] == {"excludes": ["embedding"]} |         assert kwargs["_source"] == {"excludes": ["embedding"]} | ||||||
| 
 | 
 | ||||||
|     @pytest.mark.unit |     @pytest.mark.unit | ||||||
|     def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store): |     def test_query_excluded_meta_data_return_embedding_true(self, mocked_document_store): | ||||||
| @ -79,7 +79,7 @@ class SearchEngineDocumentStoreTestAbstract: | |||||||
|         mocked_document_store.query(self.query) |         mocked_document_store.query(self.query) | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|         # we expect "embedding" was removed from the final query |         # we expect "embedding" was removed from the final query | ||||||
|         assert kwargs["body"]["_source"] == {"excludes": ["foo"]} |         assert kwargs["_source"] == {"excludes": ["foo"]} | ||||||
| 
 | 
 | ||||||
|     @pytest.mark.unit |     @pytest.mark.unit | ||||||
|     def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store): |     def test_query_excluded_meta_data_return_embedding_false(self, mocked_document_store): | ||||||
| @ -88,7 +88,7 @@ class SearchEngineDocumentStoreTestAbstract: | |||||||
|         mocked_document_store.query(self.query) |         mocked_document_store.query(self.query) | ||||||
|         # assert the resulting body is consistent with the `excluded_meta_data` value |         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|         assert kwargs["body"]["_source"] == {"excludes": ["foo", "embedding"]} |         assert kwargs["_source"] == {"excludes": ["foo", "embedding"]} | ||||||
| 
 | 
 | ||||||
|     @pytest.mark.unit |     @pytest.mark.unit | ||||||
|     def test_get_all_documents_return_embedding_true(self, mocked_document_store): |     def test_get_all_documents_return_embedding_true(self, mocked_document_store): | ||||||
| @ -97,10 +97,7 @@ class SearchEngineDocumentStoreTestAbstract: | |||||||
|         mocked_document_store.get_all_documents(return_embedding=True) |         mocked_document_store.get_all_documents(return_embedding=True) | ||||||
|         # assert the resulting body is consistent with the `excluded_meta_data` value |         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|         # starting with elasticsearch client 7.16, scan() uses the query parameter instead of body, |         assert "_source" not in kwargs | ||||||
|         # see https://github.com/elastic/elasticsearch-py/commit/889edc9ad6d728b79fadf790238b79f36449d2e2 |  | ||||||
|         body = kwargs.get("body", kwargs) |  | ||||||
|         assert "_source" not in body |  | ||||||
| 
 | 
 | ||||||
|     @pytest.mark.unit |     @pytest.mark.unit | ||||||
|     def test_get_all_documents_return_embedding_false(self, mocked_document_store): |     def test_get_all_documents_return_embedding_false(self, mocked_document_store): | ||||||
| @ -132,24 +129,7 @@ class SearchEngineDocumentStoreTestAbstract: | |||||||
|         mocked_document_store.get_document_by_id("123") |         mocked_document_store.get_document_by_id("123") | ||||||
|         # assert the resulting body is consistent with the `excluded_meta_data` value |         # assert the resulting body is consistent with the `excluded_meta_data` value | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |         _, kwargs = mocked_document_store.client.search.call_args | ||||||
|         assert "_source" not in kwargs["body"] |         assert "_source" not in kwargs | ||||||
| 
 |  | ||||||
|     @pytest.mark.unit |  | ||||||
|     def test_get_document_by_id_return_embedding_false(self, mocked_document_store): |  | ||||||
|         mocked_document_store.return_embedding = False |  | ||||||
|         mocked_document_store.get_document_by_id("123") |  | ||||||
|         # assert the resulting body is consistent with the `excluded_meta_data` value |  | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |  | ||||||
|         assert kwargs["body"]["_source"] == {"excludes": ["embedding"]} |  | ||||||
| 
 |  | ||||||
|     @pytest.mark.unit |  | ||||||
|     def test_get_document_by_id_excluded_meta_data_has_no_influence(self, mocked_document_store): |  | ||||||
|         mocked_document_store.excluded_meta_data = ["foo"] |  | ||||||
|         mocked_document_store.return_embedding = False |  | ||||||
|         mocked_document_store.get_document_by_id("123") |  | ||||||
|         # assert the resulting body is not affected by the `excluded_meta_data` value |  | ||||||
|         _, kwargs = mocked_document_store.client.search.call_args |  | ||||||
|         assert kwargs["body"]["_source"] == {"excludes": ["embedding"]} |  | ||||||
| 
 | 
 | ||||||
|     @pytest.mark.unit |     @pytest.mark.unit | ||||||
|     def test_get_all_labels_legacy_document_id(self, mocked_document_store, mocked_get_all_documents_in_index): |     def test_get_all_labels_legacy_document_id(self, mocked_document_store, mocked_get_all_documents_in_index): | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Massimiliano Pippi
						Massimiliano Pippi