| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | # | 
					
						
							|  |  |  | #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved. | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  Licensed under the Apache License, Version 2.0 (the "License"); | 
					
						
							|  |  |  | #  you may not use this file except in compliance with the License. | 
					
						
							|  |  |  | #  You may obtain a copy of the License at | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #      http://www.apache.org/licenses/LICENSE-2.0 | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | #  Unless required by applicable law or agreed to in writing, software | 
					
						
							|  |  |  | #  distributed under the License is distributed on an "AS IS" BASIS, | 
					
						
							|  |  |  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
					
						
							|  |  |  | #  See the License for the specific language governing permissions and | 
					
						
							|  |  |  | #  limitations under the License | 
					
						
							|  |  |  | # | 
					
						
							|  |  |  | import pathlib | 
					
						
							|  |  |  | import re | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import flask | 
					
						
							|  |  |  | from flask import request | 
					
						
							|  |  |  | from flask_login import login_required, current_user | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from api.db.db_models import Task, File | 
					
						
							|  |  |  | from api.db.services.file2document_service import File2DocumentService | 
					
						
							|  |  |  | from api.db.services.file_service import FileService | 
					
						
							|  |  |  | from api.db.services.task_service import TaskService, queue_tasks | 
					
						
							| 
									
										
										
										
											2024-10-16 10:17:05 +08:00
										 |  |  | from api.db.services.user_service import UserTenantService | 
					
						
							| 
									
										
										
										
											2024-11-13 12:58:37 +08:00
										 |  |  | from deepdoc.parser.html_parser import RAGFlowHtmlParser | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | from rag.nlp import search | 
					
						
							|  |  |  | from api.db.services import duplicate_name | 
					
						
							|  |  |  | from api.db.services.knowledgebase_service import KnowledgebaseService | 
					
						
							|  |  |  | from api.utils.api_utils import server_error_response, get_data_error_result, validate_request | 
					
						
							|  |  |  | from api.utils import get_uuid | 
					
						
							| 
									
										
										
										
											2024-10-16 10:17:05 +08:00
										 |  |  | from api.db import FileType, TaskStatus, ParserType, FileSource | 
					
						
							| 
									
										
										
										
											2024-08-15 19:30:43 +08:00
										 |  |  | from api.db.services.document_service import DocumentService, doc_upload_and_parse | 
					
						
							| 
									
										
										
										
											2024-11-12 14:59:41 +08:00
										 |  |  | from api.settings import RetCode, docStoreConn | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | from api.utils.api_utils import get_json_result | 
					
						
							| 
									
										
										
										
											2024-09-09 09:41:14 +08:00
										 |  |  | from rag.utils.storage_factory import STORAGE_IMPL | 
					
						
							| 
									
										
										
										
											2024-10-16 10:17:05 +08:00
										 |  |  | from api.utils.file_utils import filename_type, thumbnail | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | from api.utils.web_utils import html2pdf, is_valid_url | 
					
						
							| 
									
										
										
										
											2024-11-11 09:36:39 +08:00
										 |  |  | from api.constants import IMG_BASE64_PREFIX | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/upload', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("kb_id") | 
					
						
							|  |  |  | def upload(): | 
					
						
							|  |  |  |     kb_id = request.form.get("kb_id") | 
					
						
							|  |  |  |     if not kb_id: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     if 'file' not in request.files: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     file_objs = request.files.getlist('file') | 
					
						
							|  |  |  |     for file_obj in file_objs: | 
					
						
							|  |  |  |         if file_obj.filename == '': | 
					
						
							|  |  |  |             return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 data=False, message='No file selected!', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     e, kb = KnowledgebaseService.get_by_id(kb_id) | 
					
						
							|  |  |  |     if not e: | 
					
						
							|  |  |  |         raise LookupError("Can't find this knowledgebase!") | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 19:30:43 +08:00
										 |  |  |     err, _ = FileService.upload_document(kb, file_objs, current_user.id) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     if err: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message="\n".join(err), code=RetCode.SERVER_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     return get_json_result(data=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/web_crawl', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("kb_id", "name", "url") | 
					
						
							|  |  |  | def web_crawl(): | 
					
						
							|  |  |  |     kb_id = request.form.get("kb_id") | 
					
						
							|  |  |  |     if not kb_id: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     name = request.form.get("name") | 
					
						
							|  |  |  |     url = request.form.get("url") | 
					
						
							|  |  |  |     if not is_valid_url(url): | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     e, kb = KnowledgebaseService.get_by_id(kb_id) | 
					
						
							|  |  |  |     if not e: | 
					
						
							|  |  |  |         raise LookupError("Can't find this knowledgebase!") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     blob = html2pdf(url) | 
					
						
							|  |  |  |     if not blob: return server_error_response(ValueError("Download failure.")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     root_folder = FileService.get_root_folder(current_user.id) | 
					
						
							|  |  |  |     pf_id = root_folder["id"] | 
					
						
							|  |  |  |     FileService.init_knowledgebase_docs(pf_id, current_user.id) | 
					
						
							|  |  |  |     kb_root_folder = FileService.get_kb_folder(current_user.id) | 
					
						
							|  |  |  |     kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         filename = duplicate_name( | 
					
						
							|  |  |  |             DocumentService.query, | 
					
						
							|  |  |  |             name=name + ".pdf", | 
					
						
							|  |  |  |             kb_id=kb.id) | 
					
						
							|  |  |  |         filetype = filename_type(filename) | 
					
						
							|  |  |  |         if filetype == FileType.OTHER.value: | 
					
						
							|  |  |  |             raise RuntimeError("This type of file has not been supported yet!") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         location = filename | 
					
						
							| 
									
										
										
										
											2024-09-09 09:41:14 +08:00
										 |  |  |         while STORAGE_IMPL.obj_exist(kb_id, location): | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |             location += "_" | 
					
						
							| 
									
										
										
										
											2024-09-09 09:41:14 +08:00
										 |  |  |         STORAGE_IMPL.put(kb_id, location, blob) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         doc = { | 
					
						
							|  |  |  |             "id": get_uuid(), | 
					
						
							|  |  |  |             "kb_id": kb.id, | 
					
						
							|  |  |  |             "parser_id": kb.parser_id, | 
					
						
							|  |  |  |             "parser_config": kb.parser_config, | 
					
						
							|  |  |  |             "created_by": current_user.id, | 
					
						
							|  |  |  |             "type": filetype, | 
					
						
							|  |  |  |             "name": filename, | 
					
						
							|  |  |  |             "location": location, | 
					
						
							|  |  |  |             "size": len(blob), | 
					
						
							|  |  |  |             "thumbnail": thumbnail(filename, blob) | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         if doc["type"] == FileType.VISUAL: | 
					
						
							|  |  |  |             doc["parser_id"] = ParserType.PICTURE.value | 
					
						
							|  |  |  |         if doc["type"] == FileType.AURAL: | 
					
						
							|  |  |  |             doc["parser_id"] = ParserType.AUDIO.value | 
					
						
							|  |  |  |         if re.search(r"\.(ppt|pptx|pages)$", filename): | 
					
						
							|  |  |  |             doc["parser_id"] = ParserType.PRESENTATION.value | 
					
						
							| 
									
										
										
										
											2024-09-27 10:29:30 +08:00
										 |  |  |         if re.search(r"\.(eml)$", filename): | 
					
						
							|  |  |  |             doc["parser_id"] = ParserType.EMAIL.value | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         DocumentService.insert(doc) | 
					
						
							|  |  |  |         FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  |     return get_json_result(data=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/create', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("name", "kb_id") | 
					
						
							|  |  |  | def create(): | 
					
						
							|  |  |  |     req = request.json | 
					
						
							|  |  |  |     kb_id = req["kb_id"] | 
					
						
							|  |  |  |     if not kb_id: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         e, kb = KnowledgebaseService.get_by_id(kb_id) | 
					
						
							|  |  |  |         if not e: | 
					
						
							|  |  |  |             return get_data_error_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message="Can't find this knowledgebase!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if DocumentService.query(name=req["name"], kb_id=kb_id): | 
					
						
							|  |  |  |             return get_data_error_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message="Duplicated document name in the same knowledgebase.") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         doc = DocumentService.insert({ | 
					
						
							|  |  |  |             "id": get_uuid(), | 
					
						
							|  |  |  |             "kb_id": kb.id, | 
					
						
							|  |  |  |             "parser_id": kb.parser_id, | 
					
						
							|  |  |  |             "parser_config": kb.parser_config, | 
					
						
							|  |  |  |             "created_by": current_user.id, | 
					
						
							|  |  |  |             "type": FileType.VIRTUAL, | 
					
						
							|  |  |  |             "name": req["name"], | 
					
						
							|  |  |  |             "location": "", | 
					
						
							|  |  |  |             "size": 0 | 
					
						
							|  |  |  |         }) | 
					
						
							|  |  |  |         return get_json_result(data=doc.to_json()) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/list', methods=['GET']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | def list_docs(): | 
					
						
							|  |  |  |     kb_id = request.args.get("kb_id") | 
					
						
							|  |  |  |     if not kb_id: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-09-04 10:36:15 +08:00
										 |  |  |     tenants = UserTenantService.query(user_id=current_user.id) | 
					
						
							|  |  |  |     for tenant in tenants: | 
					
						
							|  |  |  |         if KnowledgebaseService.query( | 
					
						
							|  |  |  |                 tenant_id=tenant.tenant_id, id=kb_id): | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='Only owner of knowledgebase authorized for this operation.', | 
					
						
							|  |  |  |             code=RetCode.OPERATING_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     keywords = request.args.get("keywords", "") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     page_number = int(request.args.get("page", 1)) | 
					
						
							|  |  |  |     items_per_page = int(request.args.get("page_size", 15)) | 
					
						
							|  |  |  |     orderby = request.args.get("orderby", "create_time") | 
					
						
							|  |  |  |     desc = request.args.get("desc", True) | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         docs, tol = DocumentService.get_by_kb_id( | 
					
						
							|  |  |  |             kb_id, page_number, items_per_page, orderby, desc, keywords) | 
					
						
							| 
									
										
										
										
											2024-10-10 09:09:29 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         for doc_item in docs: | 
					
						
							|  |  |  |             if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX): | 
					
						
							| 
									
										
										
										
											2024-10-10 11:05:03 +08:00
										 |  |  |                 doc_item['thumbnail'] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}" | 
					
						
							| 
									
										
										
										
											2024-10-10 09:09:29 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         return get_json_result(data={"total": tol, "docs": docs}) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 09:34:24 +08:00
										 |  |  | @manager.route('/infos', methods=['POST']) | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  | @login_required | 
					
						
							| 
									
										
										
										
											2024-08-15 09:34:24 +08:00
										 |  |  | def docinfos(): | 
					
						
							|  |  |  |     req = request.json | 
					
						
							|  |  |  |     doc_ids = req["doc_ids"] | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |     for doc_id in doc_ids: | 
					
						
							|  |  |  |         if not DocumentService.accessible(doc_id, current_user.id): | 
					
						
							|  |  |  |             return get_json_result( | 
					
						
							|  |  |  |                 data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message='No authorization.', | 
					
						
							|  |  |  |                 code=RetCode.AUTHENTICATION_ERROR | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:34:24 +08:00
										 |  |  |     docs = DocumentService.get_by_ids(doc_ids) | 
					
						
							|  |  |  |     return get_json_result(data=list(docs.dicts())) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | @manager.route('/thumbnails', methods=['GET']) | 
					
						
							| 
									
										
										
										
											2024-08-15 19:30:43 +08:00
										 |  |  | #@login_required | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | def thumbnails(): | 
					
						
							|  |  |  |     doc_ids = request.args.get("doc_ids").split(",") | 
					
						
							|  |  |  |     if not doc_ids: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='Lack of "Document ID"', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         docs = DocumentService.get_thumbnails(doc_ids) | 
					
						
							| 
									
										
										
										
											2024-10-11 16:10:27 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         for doc_item in docs: | 
					
						
							|  |  |  |             if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX): | 
					
						
							|  |  |  |                 doc_item['thumbnail'] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         return get_json_result(data={d["id"]: d["thumbnail"] for d in docs}) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/change_status', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("doc_id", "status") | 
					
						
							|  |  |  | def change_status(): | 
					
						
							|  |  |  |     req = request.json | 
					
						
							|  |  |  |     if str(req["status"]) not in ["0", "1"]: | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |             data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             message='"Status" must be either 0 or 1!', | 
					
						
							|  |  |  |             code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |     if not DocumentService.accessible(req["doc_id"], current_user.id): | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							|  |  |  |             data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             message='No authorization.', | 
					
						
							|  |  |  |             code=RetCode.AUTHENTICATION_ERROR) | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     try: | 
					
						
							|  |  |  |         e, doc = DocumentService.get_by_id(req["doc_id"]) | 
					
						
							|  |  |  |         if not e: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             return get_data_error_result(message="Document not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         e, kb = KnowledgebaseService.get_by_id(doc.kb_id) | 
					
						
							|  |  |  |         if not e: | 
					
						
							|  |  |  |             return get_data_error_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message="Can't find this knowledgebase!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if not DocumentService.update_by_id( | 
					
						
							|  |  |  |                 req["doc_id"], {"status": str(req["status"])}): | 
					
						
							|  |  |  |             return get_data_error_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message="Database error (Document update)!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-11-12 14:59:41 +08:00
										 |  |  |         status = int(req["status"]) | 
					
						
							|  |  |  |         docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         return get_json_result(data=True) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/rm', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("doc_id") | 
					
						
							|  |  |  | def rm(): | 
					
						
							|  |  |  |     req = request.json | 
					
						
							|  |  |  |     doc_ids = req["doc_id"] | 
					
						
							|  |  |  |     if isinstance(doc_ids, str): doc_ids = [doc_ids] | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     for doc_id in doc_ids: | 
					
						
							|  |  |  |         if not DocumentService.accessible4deletion(doc_id, current_user.id): | 
					
						
							|  |  |  |             return get_json_result( | 
					
						
							|  |  |  |                 data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message='No authorization.', | 
					
						
							|  |  |  |                 code=RetCode.AUTHENTICATION_ERROR | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |             ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     root_folder = FileService.get_root_folder(current_user.id) | 
					
						
							|  |  |  |     pf_id = root_folder["id"] | 
					
						
							|  |  |  |     FileService.init_knowledgebase_docs(pf_id, current_user.id) | 
					
						
							|  |  |  |     errors = "" | 
					
						
							|  |  |  |     for doc_id in doc_ids: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             e, doc = DocumentService.get_by_id(doc_id) | 
					
						
							|  |  |  |             if not e: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 return get_data_error_result(message="Document not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |             tenant_id = DocumentService.get_tenant_id(doc_id) | 
					
						
							|  |  |  |             if not tenant_id: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 return get_data_error_result(message="Tenant not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-19 19:19:27 +08:00
										 |  |  |             b, n = File2DocumentService.get_storage_address(doc_id=doc_id) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |             if not DocumentService.remove_document(doc, tenant_id): | 
					
						
							|  |  |  |                 return get_data_error_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                     message="Database error (Document removal)!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |             f2d = File2DocumentService.get_by_document_id(doc_id) | 
					
						
							|  |  |  |             FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) | 
					
						
							|  |  |  |             File2DocumentService.delete_by_document_id(doc_id) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-09 09:41:14 +08:00
										 |  |  |             STORAGE_IMPL.rm(b, n) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         except Exception as e: | 
					
						
							|  |  |  |             errors += str(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if errors: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |         return get_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return get_json_result(data=True) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/run', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("doc_ids", "run") | 
					
						
							|  |  |  | def run(): | 
					
						
							|  |  |  |     req = request.json | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |     for doc_id in req["doc_ids"]: | 
					
						
							|  |  |  |         if not DocumentService.accessible(doc_id, current_user.id): | 
					
						
							|  |  |  |             return get_json_result( | 
					
						
							|  |  |  |                 data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message='No authorization.', | 
					
						
							|  |  |  |                 code=RetCode.AUTHENTICATION_ERROR | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |             ) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     try: | 
					
						
							|  |  |  |         for id in req["doc_ids"]: | 
					
						
							|  |  |  |             info = {"run": str(req["run"]), "progress": 0} | 
					
						
							|  |  |  |             if str(req["run"]) == TaskStatus.RUNNING.value: | 
					
						
							|  |  |  |                 info["progress_msg"] = "" | 
					
						
							|  |  |  |                 info["chunk_num"] = 0 | 
					
						
							|  |  |  |                 info["token_num"] = 0 | 
					
						
							|  |  |  |             DocumentService.update_by_id(id, info) | 
					
						
							|  |  |  |             # if str(req["run"]) == TaskStatus.CANCEL.value: | 
					
						
							|  |  |  |             tenant_id = DocumentService.get_tenant_id(id) | 
					
						
							|  |  |  |             if not tenant_id: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 return get_data_error_result(message="Tenant not found!") | 
					
						
							| 
									
										
										
										
											2024-11-12 14:59:41 +08:00
										 |  |  |             e, doc = DocumentService.get_by_id(id) | 
					
						
							|  |  |  |             if not e: | 
					
						
							|  |  |  |                 return get_data_error_result(message="Document not found!") | 
					
						
							|  |  |  |             if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id): | 
					
						
							|  |  |  |                 docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |             if str(req["run"]) == TaskStatus.RUNNING.value: | 
					
						
							|  |  |  |                 TaskService.filter_delete([Task.doc_id == id]) | 
					
						
							|  |  |  |                 e, doc = DocumentService.get_by_id(id) | 
					
						
							|  |  |  |                 doc = doc.to_dict() | 
					
						
							|  |  |  |                 doc["tenant_id"] = tenant_id | 
					
						
							| 
									
										
										
										
											2024-09-19 19:19:27 +08:00
										 |  |  |                 bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"]) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |                 queue_tasks(doc, bucket, name) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return get_json_result(data=True) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/rename', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("doc_id", "name") | 
					
						
							|  |  |  | def rename(): | 
					
						
							|  |  |  |     req = request.json | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |     if not DocumentService.accessible(req["doc_id"], current_user.id): | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							|  |  |  |             data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             message='No authorization.', | 
					
						
							|  |  |  |             code=RetCode.AUTHENTICATION_ERROR | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     try: | 
					
						
							|  |  |  |         e, doc = DocumentService.get_by_id(req["doc_id"]) | 
					
						
							|  |  |  |         if not e: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             return get_data_error_result(message="Document not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         if pathlib.Path(req["name"].lower()).suffix != pathlib.Path( | 
					
						
							|  |  |  |                 doc.name.lower()).suffix: | 
					
						
							|  |  |  |             return get_json_result( | 
					
						
							|  |  |  |                 data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message="The extension of file can't be changed", | 
					
						
							|  |  |  |                 code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id): | 
					
						
							|  |  |  |             if d.name == req["name"]: | 
					
						
							|  |  |  |                 return get_data_error_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                     message="Duplicated document name in the same knowledgebase.") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if not DocumentService.update_by_id( | 
					
						
							|  |  |  |                 req["doc_id"], {"name": req["name"]}): | 
					
						
							|  |  |  |             return get_data_error_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 message="Database error (Document rename)!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         informs = File2DocumentService.get_by_document_id(req["doc_id"]) | 
					
						
							|  |  |  |         if informs: | 
					
						
							|  |  |  |             e, file = FileService.get_by_id(informs[0].file_id) | 
					
						
							|  |  |  |             FileService.update_by_id(file.id, {"name": req["name"]}) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return get_json_result(data=True) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/get/<doc_id>', methods=['GET']) | 
					
						
							|  |  |  | # @login_required | 
					
						
							|  |  |  | def get(doc_id): | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         e, doc = DocumentService.get_by_id(doc_id) | 
					
						
							|  |  |  |         if not e: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             return get_data_error_result(message="Document not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-09-19 19:19:27 +08:00
										 |  |  |         b, n = File2DocumentService.get_storage_address(doc_id=doc_id) | 
					
						
							| 
									
										
										
										
											2024-09-09 09:41:14 +08:00
										 |  |  |         response = flask.make_response(STORAGE_IMPL.get(b, n)) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         ext = re.search(r"\.([^.]+)$", doc.name) | 
					
						
							|  |  |  |         if ext: | 
					
						
							|  |  |  |             if doc.type == FileType.VISUAL.value: | 
					
						
							|  |  |  |                 response.headers.set('Content-Type', 'image/%s' % ext.group(1)) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 response.headers.set( | 
					
						
							|  |  |  |                     'Content-Type', | 
					
						
							|  |  |  |                     'application/%s' % | 
					
						
							|  |  |  |                     ext.group(1)) | 
					
						
							|  |  |  |         return response | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/change_parser', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("doc_id", "parser_id") | 
					
						
							|  |  |  | def change_parser(): | 
					
						
							|  |  |  |     req = request.json | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if not DocumentService.accessible(req["doc_id"], current_user.id): | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							|  |  |  |             data=False, | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             message='No authorization.', | 
					
						
							|  |  |  |             code=RetCode.AUTHENTICATION_ERROR | 
					
						
							| 
									
										
										
										
											2024-10-18 13:48:57 +08:00
										 |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |     try: | 
					
						
							|  |  |  |         e, doc = DocumentService.get_by_id(req["doc_id"]) | 
					
						
							|  |  |  |         if not e: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             return get_data_error_result(message="Document not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         if doc.parser_id.lower() == req["parser_id"].lower(): | 
					
						
							|  |  |  |             if "parser_config" in req: | 
					
						
							|  |  |  |                 if req["parser_config"] == doc.parser_config: | 
					
						
							|  |  |  |                     return get_json_result(data=True) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 return get_json_result(data=True) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-15 10:11:09 +08:00
										 |  |  |         if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture") | 
					
						
							|  |  |  |                 or (re.search( | 
					
						
							|  |  |  |                     r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")): | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             return get_data_error_result(message="Not supported yet!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         e = DocumentService.update_by_id(doc.id, | 
					
						
							|  |  |  |                                          {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", | 
					
						
							|  |  |  |                                           "run": TaskStatus.UNSTART.value}) | 
					
						
							|  |  |  |         if not e: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             return get_data_error_result(message="Document not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         if "parser_config" in req: | 
					
						
							|  |  |  |             DocumentService.update_parser_config(doc.id, req["parser_config"]) | 
					
						
							|  |  |  |         if doc.token_num > 0: | 
					
						
							|  |  |  |             e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, | 
					
						
							|  |  |  |                                                     doc.process_duation * -1) | 
					
						
							|  |  |  |             if not e: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 return get_data_error_result(message="Document not found!") | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |             tenant_id = DocumentService.get_tenant_id(req["doc_id"]) | 
					
						
							|  |  |  |             if not tenant_id: | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 return get_data_error_result(message="Tenant not found!") | 
					
						
							| 
									
										
										
										
											2024-11-12 14:59:41 +08:00
										 |  |  |             if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id): | 
					
						
							|  |  |  |                 docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |         return get_json_result(data=True) | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/image/<image_id>', methods=['GET']) | 
					
						
							|  |  |  | # @login_required | 
					
						
							|  |  |  | def get_image(image_id): | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         bkt, nm = image_id.split("-") | 
					
						
							| 
									
										
										
										
											2024-09-09 09:41:14 +08:00
										 |  |  |         response = flask.make_response(STORAGE_IMPL.get(bkt, nm)) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  |         response.headers.set('Content-Type', 'image/JPEG') | 
					
						
							|  |  |  |         return response | 
					
						
							|  |  |  |     except Exception as e: | 
					
						
							|  |  |  |         return server_error_response(e) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/upload_and_parse', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | @validate_request("conversation_id") | 
					
						
							|  |  |  | def upload_and_parse(): | 
					
						
							|  |  |  |     if 'file' not in request.files: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |             data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  |     file_objs = request.files.getlist('file') | 
					
						
							|  |  |  |     for file_obj in file_objs: | 
					
						
							|  |  |  |         if file_obj.filename == '': | 
					
						
							|  |  |  |             return get_json_result( | 
					
						
							| 
									
										
										
										
											2024-11-05 11:02:31 +08:00
										 |  |  |                 data=False, message='No file selected!', code=RetCode.ARGUMENT_ERROR) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 19:30:43 +08:00
										 |  |  |     doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id) | 
					
						
							| 
									
										
										
										
											2024-08-15 09:17:36 +08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-15 19:30:43 +08:00
										 |  |  |     return get_json_result(data=doc_ids) | 
					
						
							| 
									
										
										
										
											2024-11-13 12:58:37 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @manager.route('/parse', methods=['POST']) | 
					
						
							|  |  |  | @login_required | 
					
						
							|  |  |  | def parse(): | 
					
						
							| 
									
										
										
										
											2024-11-14 12:29:15 +08:00
										 |  |  |     url = request.json.get("url") if request.json else "" | 
					
						
							| 
									
										
										
										
											2024-11-13 12:58:37 +08:00
										 |  |  |     if url: | 
					
						
							|  |  |  |         if not is_valid_url(url): | 
					
						
							|  |  |  |             return get_json_result( | 
					
						
							|  |  |  |                 data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR) | 
					
						
							|  |  |  |         from selenium.webdriver import Chrome, ChromeOptions | 
					
						
							|  |  |  |         options = ChromeOptions() | 
					
						
							|  |  |  |         options.add_argument('--headless') | 
					
						
							|  |  |  |         options.add_argument('--disable-gpu') | 
					
						
							|  |  |  |         options.add_argument('--no-sandbox') | 
					
						
							|  |  |  |         options.add_argument('--disable-dev-shm-usage') | 
					
						
							|  |  |  |         driver = Chrome(options=options) | 
					
						
							|  |  |  |         driver.get(url) | 
					
						
							| 
									
										
										
										
											2024-11-14 12:29:15 +08:00
										 |  |  |         sections = RAGFlowHtmlParser().parser_txt(driver.page_source) | 
					
						
							| 
									
										
										
										
											2024-11-13 12:58:37 +08:00
										 |  |  |         return get_json_result(data="\n".join(sections)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if 'file' not in request.files: | 
					
						
							|  |  |  |         return get_json_result( | 
					
						
							|  |  |  |             data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     file_objs = request.files.getlist('file') | 
					
						
							|  |  |  |     txt = FileService.parse_docs(file_objs, current_user.id) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return get_json_result(data=txt) |