| 
									
										
										
										
											2024-09-30 15:38:43 +08:00
										 |  |  | import json | 
					
						
							|  |  |  | from copy import deepcopy | 
					
						
							|  |  |  | from datetime import datetime, timezone | 
					
						
							|  |  |  | from typing import Any, Optional, Union | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import httpx | 
					
						
							|  |  |  | import validators | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-28 18:37:35 +08:00
										 |  |  | from constants import HIDDEN_VALUE | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-30 15:38:43 +08:00
										 |  |  | # from tasks.external_document_indexing_task import external_document_indexing_task | 
					
						
							|  |  |  | from core.helper import ssrf_proxy | 
					
						
							|  |  |  | from extensions.ext_database import db | 
					
						
							|  |  |  | from models.dataset import ( | 
					
						
							|  |  |  |     Dataset, | 
					
						
							|  |  |  |     ExternalKnowledgeApis, | 
					
						
							|  |  |  |     ExternalKnowledgeBindings, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from services.entities.external_knowledge_entities.external_knowledge_entities import ( | 
					
						
							|  |  |  |     Authorization, | 
					
						
							|  |  |  |     ExternalKnowledgeApiSetting, | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | from services.errors.dataset import DatasetNameDuplicateError | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class ExternalDatasetService: | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]: | 
					
						
							|  |  |  |         query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by( | 
					
						
							|  |  |  |             ExternalKnowledgeApis.created_at.desc() | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         if search: | 
					
						
							|  |  |  |             query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         external_knowledge_apis = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return external_knowledge_apis.items, external_knowledge_apis.total | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @classmethod | 
					
						
							|  |  |  |     def validate_api_list(cls, api_settings: dict): | 
					
						
							|  |  |  |         if not api_settings: | 
					
						
							|  |  |  |             raise ValueError("api list is empty") | 
					
						
							|  |  |  |         if "endpoint" not in api_settings and not api_settings["endpoint"]: | 
					
						
							|  |  |  |             raise ValueError("endpoint is required") | 
					
						
							|  |  |  |         if "api_key" not in api_settings and not api_settings["api_key"]: | 
					
						
							|  |  |  |             raise ValueError("api_key is required") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis: | 
					
						
							|  |  |  |         ExternalDatasetService.check_endpoint_and_api_key(args.get("settings")) | 
					
						
							|  |  |  |         external_knowledge_api = ExternalKnowledgeApis( | 
					
						
							|  |  |  |             tenant_id=tenant_id, | 
					
						
							|  |  |  |             created_by=user_id, | 
					
						
							|  |  |  |             updated_by=user_id, | 
					
						
							|  |  |  |             name=args.get("name"), | 
					
						
							|  |  |  |             description=args.get("description", ""), | 
					
						
							|  |  |  |             settings=json.dumps(args.get("settings"), ensure_ascii=False), | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         db.session.add(external_knowledge_api) | 
					
						
							|  |  |  |         db.session.commit() | 
					
						
							|  |  |  |         return external_knowledge_api | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def check_endpoint_and_api_key(settings: dict): | 
					
						
							|  |  |  |         if "endpoint" not in settings or not settings["endpoint"]: | 
					
						
							|  |  |  |             raise ValueError("endpoint is required") | 
					
						
							|  |  |  |         if "api_key" not in settings or not settings["api_key"]: | 
					
						
							|  |  |  |             raise ValueError("api_key is required") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         endpoint = f"{settings['endpoint']}/retrieval" | 
					
						
							|  |  |  |         api_key = settings["api_key"] | 
					
						
							| 
									
										
										
										
											2024-10-29 10:33:15 +08:00
										 |  |  |         if not validators.url(endpoint, simple_host=True): | 
					
						
							| 
									
										
										
										
											2024-09-30 15:38:43 +08:00
										 |  |  |             raise ValueError(f"invalid endpoint: {endpoint}") | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             response = httpx.post(endpoint, headers={"Authorization": f"Bearer {api_key}"}) | 
					
						
							|  |  |  |         except Exception as e: | 
					
						
							|  |  |  |             raise ValueError(f"failed to connect to the endpoint: {endpoint}") | 
					
						
							|  |  |  |         if response.status_code == 502: | 
					
						
							|  |  |  |             raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}") | 
					
						
							|  |  |  |         if response.status_code == 404: | 
					
						
							|  |  |  |             raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}") | 
					
						
							|  |  |  |         if response.status_code == 403: | 
					
						
							|  |  |  |             raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis: | 
					
						
							|  |  |  |         return ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis: | 
					
						
							|  |  |  |         external_knowledge_api = ExternalKnowledgeApis.query.filter_by( | 
					
						
							|  |  |  |             id=external_knowledge_api_id, tenant_id=tenant_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  |         if external_knowledge_api is None: | 
					
						
							|  |  |  |             raise ValueError("api template not found") | 
					
						
							| 
									
										
										
										
											2024-10-28 18:37:35 +08:00
										 |  |  |         if args.get("settings") and args.get("settings").get("api_key") == HIDDEN_VALUE: | 
					
						
							|  |  |  |             args.get("settings")["api_key"] = external_knowledge_api.settings_dict.get("api_key") | 
					
						
							| 
									
										
										
										
											2024-09-30 15:38:43 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         external_knowledge_api.name = args.get("name") | 
					
						
							|  |  |  |         external_knowledge_api.description = args.get("description", "") | 
					
						
							|  |  |  |         external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False) | 
					
						
							|  |  |  |         external_knowledge_api.updated_by = user_id | 
					
						
							|  |  |  |         external_knowledge_api.updated_at = datetime.now(timezone.utc).replace(tzinfo=None) | 
					
						
							|  |  |  |         db.session.commit() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return external_knowledge_api | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str): | 
					
						
							|  |  |  |         external_knowledge_api = ExternalKnowledgeApis.query.filter_by( | 
					
						
							|  |  |  |             id=external_knowledge_api_id, tenant_id=tenant_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  |         if external_knowledge_api is None: | 
					
						
							|  |  |  |             raise ValueError("api template not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         db.session.delete(external_knowledge_api) | 
					
						
							|  |  |  |         db.session.commit() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]: | 
					
						
							|  |  |  |         count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count() | 
					
						
							|  |  |  |         if count > 0: | 
					
						
							|  |  |  |             return True, count | 
					
						
							|  |  |  |         return False, 0 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings: | 
					
						
							|  |  |  |         external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by( | 
					
						
							|  |  |  |             dataset_id=dataset_id, tenant_id=tenant_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  |         if not external_knowledge_binding: | 
					
						
							|  |  |  |             raise ValueError("external knowledge binding not found") | 
					
						
							|  |  |  |         return external_knowledge_binding | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict): | 
					
						
							|  |  |  |         external_knowledge_api = ExternalKnowledgeApis.query.filter_by( | 
					
						
							|  |  |  |             id=external_knowledge_api_id, tenant_id=tenant_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  |         if external_knowledge_api is None: | 
					
						
							|  |  |  |             raise ValueError("api template not found") | 
					
						
							|  |  |  |         settings = json.loads(external_knowledge_api.settings) | 
					
						
							|  |  |  |         for setting in settings: | 
					
						
							|  |  |  |             custom_parameters = setting.get("document_process_setting") | 
					
						
							|  |  |  |             if custom_parameters: | 
					
						
							|  |  |  |                 for parameter in custom_parameters: | 
					
						
							|  |  |  |                     if parameter.get("required", False) and not process_parameter.get(parameter.get("name")): | 
					
						
							|  |  |  |                         raise ValueError(f'{parameter.get("name")} is required') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def process_external_api( | 
					
						
							|  |  |  |         settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]] | 
					
						
							|  |  |  |     ) -> httpx.Response: | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  |         do http request depending on api bundle | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         kwargs = { | 
					
						
							|  |  |  |             "url": settings.url, | 
					
						
							|  |  |  |             "headers": settings.headers, | 
					
						
							|  |  |  |             "follow_redirects": True, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         response = getattr(ssrf_proxy, settings.request_method)(data=json.dumps(settings.params), files=files, **kwargs) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return response | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]: | 
					
						
							|  |  |  |         authorization = deepcopy(authorization) | 
					
						
							|  |  |  |         if headers: | 
					
						
							|  |  |  |             headers = deepcopy(headers) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             headers = {} | 
					
						
							|  |  |  |         if authorization.type == "api-key": | 
					
						
							|  |  |  |             if authorization.config is None: | 
					
						
							|  |  |  |                 raise ValueError("authorization config is required") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if authorization.config.api_key is None: | 
					
						
							|  |  |  |                 raise ValueError("api_key is required") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if not authorization.config.header: | 
					
						
							|  |  |  |                 authorization.config.header = "Authorization" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             if authorization.config.type == "bearer": | 
					
						
							|  |  |  |                 headers[authorization.config.header] = f"Bearer {authorization.config.api_key}" | 
					
						
							|  |  |  |             elif authorization.config.type == "basic": | 
					
						
							|  |  |  |                 headers[authorization.config.header] = f"Basic {authorization.config.api_key}" | 
					
						
							|  |  |  |             elif authorization.config.type == "custom": | 
					
						
							|  |  |  |                 headers[authorization.config.header] = authorization.config.api_key | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return headers | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting: | 
					
						
							|  |  |  |         return ExternalKnowledgeApiSetting.parse_obj(settings) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset: | 
					
						
							|  |  |  |         # check if dataset name already exists | 
					
						
							|  |  |  |         if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first(): | 
					
						
							|  |  |  |             raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.") | 
					
						
							|  |  |  |         external_knowledge_api = ExternalKnowledgeApis.query.filter_by( | 
					
						
							|  |  |  |             id=args.get("external_knowledge_api_id"), tenant_id=tenant_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if external_knowledge_api is None: | 
					
						
							|  |  |  |             raise ValueError("api template not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         dataset = Dataset( | 
					
						
							|  |  |  |             tenant_id=tenant_id, | 
					
						
							|  |  |  |             name=args.get("name"), | 
					
						
							|  |  |  |             description=args.get("description", ""), | 
					
						
							|  |  |  |             provider="external", | 
					
						
							|  |  |  |             retrieval_model=args.get("external_retrieval_model"), | 
					
						
							|  |  |  |             created_by=user_id, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         db.session.add(dataset) | 
					
						
							|  |  |  |         db.session.flush() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         external_knowledge_binding = ExternalKnowledgeBindings( | 
					
						
							|  |  |  |             tenant_id=tenant_id, | 
					
						
							|  |  |  |             dataset_id=dataset.id, | 
					
						
							|  |  |  |             external_knowledge_api_id=args.get("external_knowledge_api_id"), | 
					
						
							|  |  |  |             external_knowledge_id=args.get("external_knowledge_id"), | 
					
						
							|  |  |  |             created_by=user_id, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         db.session.add(external_knowledge_binding) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         db.session.commit() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return dataset | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     @staticmethod | 
					
						
							|  |  |  |     def fetch_external_knowledge_retrieval( | 
					
						
							|  |  |  |         tenant_id: str, dataset_id: str, query: str, external_retrieval_parameters: dict | 
					
						
							|  |  |  |     ) -> list: | 
					
						
							|  |  |  |         external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by( | 
					
						
							|  |  |  |             dataset_id=dataset_id, tenant_id=tenant_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  |         if not external_knowledge_binding: | 
					
						
							|  |  |  |             raise ValueError("external knowledge binding not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         external_knowledge_api = ExternalKnowledgeApis.query.filter_by( | 
					
						
							|  |  |  |             id=external_knowledge_binding.external_knowledge_api_id | 
					
						
							|  |  |  |         ).first() | 
					
						
							|  |  |  |         if not external_knowledge_api: | 
					
						
							|  |  |  |             raise ValueError("external api template not found") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         settings = json.loads(external_knowledge_api.settings) | 
					
						
							|  |  |  |         headers = {"Content-Type": "application/json"} | 
					
						
							|  |  |  |         if settings.get("api_key"): | 
					
						
							|  |  |  |             headers["Authorization"] = f"Bearer {settings.get('api_key')}" | 
					
						
							|  |  |  |         score_threshold_enabled = external_retrieval_parameters.get("score_threshold_enabled") or False | 
					
						
							|  |  |  |         score_threshold = external_retrieval_parameters.get("score_threshold", 0.0) if score_threshold_enabled else 0.0 | 
					
						
							|  |  |  |         request_params = { | 
					
						
							|  |  |  |             "retrieval_setting": { | 
					
						
							|  |  |  |                 "top_k": external_retrieval_parameters.get("top_k"), | 
					
						
							|  |  |  |                 "score_threshold": score_threshold, | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |             "query": query, | 
					
						
							|  |  |  |             "knowledge_id": external_knowledge_binding.external_knowledge_id, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         external_knowledge_api_setting = { | 
					
						
							|  |  |  |             "url": f"{settings.get('endpoint')}/retrieval", | 
					
						
							|  |  |  |             "request_method": "post", | 
					
						
							|  |  |  |             "headers": headers, | 
					
						
							|  |  |  |             "params": request_params, | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         response = ExternalDatasetService.process_external_api( | 
					
						
							|  |  |  |             ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         if response.status_code == 200: | 
					
						
							|  |  |  |             return response.json().get("records", []) | 
					
						
							|  |  |  |         return [] |