2024-08-15 09:17:36 +08:00
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
2025-01-10 19:06:59 +08:00
import json
2024-11-15 15:41:50 +08:00
import os . path
2024-08-15 09:17:36 +08:00
import pathlib
import re
import flask
from flask import request
2025-05-19 14:54:06 +08:00
from flask_login import current_user , login_required
2024-08-15 09:17:36 +08:00
2025-05-19 14:54:06 +08:00
from api import settings
2025-06-17 18:01:30 +08:00
from api . constants import FILE_NAME_LEN_LIMIT , IMG_BASE64_PREFIX
2025-05-19 14:54:06 +08:00
from api . db import VALID_FILE_TYPES , VALID_TASK_STATUS , FileSource , FileType , ParserType , TaskStatus
2024-12-12 16:38:03 +08:00
from api . db . db_models import File , Task
2025-05-19 14:54:06 +08:00
from api . db . services import duplicate_name
from api . db . services . document_service import DocumentService , doc_upload_and_parse
2024-08-15 09:17:36 +08:00
from api . db . services . file2document_service import File2DocumentService
from api . db . services . file_service import FileService
from api . db . services . knowledgebase_service import KnowledgebaseService
2025-05-19 14:54:06 +08:00
from api . db . services . task_service import TaskService , queue_tasks
from api . db . services . user_service import UserTenantService
from api . utils import get_uuid
2024-12-12 16:38:03 +08:00
from api . utils . api_utils import (
get_data_error_result ,
2025-05-19 14:54:06 +08:00
get_json_result ,
server_error_response ,
2024-12-12 16:38:03 +08:00
validate_request ,
)
2025-05-19 14:54:06 +08:00
from api . utils . file_utils import filename_type , get_project_base_directory , thumbnail
2024-08-15 09:17:36 +08:00
from api . utils . web_utils import html2pdf , is_valid_url
2025-05-19 14:54:06 +08:00
from deepdoc . parser . html_parser import RAGFlowHtmlParser
from rag . nlp import search
from rag . utils . storage_factory import STORAGE_IMPL
2024-08-15 09:17:36 +08:00
2025-05-19 14:54:06 +08:00
@manager.route("/upload", methods=["POST"])  # noqa: F821
@login_required
@validate_request("kb_id")
def upload():
    """Upload one or more files into a knowledge base.

    Reads ``kb_id`` from the form data and the files from the ``file``
    multipart field. Every file must have a non-empty name whose UTF-8
    encoding does not exceed ``FILE_NAME_LEN_LIMIT`` bytes. The files are
    then passed to ``FileService.upload_document``.

    Returns:
        A JSON result carrying the uploaded entries (first element of each
        pair returned by the service), or an error result when validation
        fails, the service reports errors, or nothing was uploaded.

    Raises:
        LookupError: If ``KnowledgebaseService.get_by_id`` finds no
            knowledge base for ``kb_id``.
    """

    def _argument_error(message):
        # All validation failures share the same payload shape and code.
        return get_json_result(data=False, message=message, code=settings.RetCode.ARGUMENT_ERROR)

    kb_id = request.form.get("kb_id")
    if not kb_id:
        return _argument_error('Lack of "KB ID"')
    if "file" not in request.files:
        return _argument_error("No file part!")

    uploads = request.files.getlist("file")
    for upload_item in uploads:
        if upload_item.filename == "":
            return _argument_error("No file selected!")
        if len(upload_item.filename.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
            return _argument_error(f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.")

    found, kb = KnowledgebaseService.get_by_id(kb_id)
    if not found:
        raise LookupError("Can't find this knowledgebase!")

    errors, uploaded = FileService.upload_document(kb, uploads, current_user.id)
    if errors:
        return get_json_result(data=uploaded, message="\n".join(errors), code=settings.RetCode.SERVER_ERROR)
    if not uploaded:
        return get_json_result(
            data=uploaded,
            message="There seems to be an issue with your file format. Please verify it is correct and not corrupted.",
            code=settings.RetCode.DATA_ERROR,
        )

    # Keep only the first element of each entry; the blob is not returned.
    return get_json_result(data=[entry[0] for entry in uploaded])
2024-08-15 09:17:36 +08:00
2025-05-19 14:54:06 +08:00
@manager.route("/web_crawl", methods=["POST"])  # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
def web_crawl():
    """Render a web page to PDF and register it as a document.

    Reads ``kb_id``, ``name`` and ``url`` from the form data, converts the
    page with ``html2pdf``, stores the resulting blob through
    ``STORAGE_IMPL`` and inserts a document record plus its file entry.

    Returns:
        A JSON result with ``data=True`` on success, an argument-error
        result for a missing ``kb_id`` or invalid URL, or a server-error
        response when the download or the registration fails.

    Raises:
        LookupError: If no knowledge base matches ``kb_id``.
    """
    kb_id = request.form.get("kb_id")
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)

    name = request.form.get("name")
    url = request.form.get("url")
    if not is_valid_url(url):
        return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)

    found, kb = KnowledgebaseService.get_by_id(kb_id)
    if not found:
        raise LookupError("Can't find this knowledgebase!")

    blob = html2pdf(url)
    if not blob:
        return server_error_response(ValueError("Download failure."))

    user_id = current_user.id
    FileService.init_knowledgebase_docs(FileService.get_root_folder(user_id)["id"], user_id)
    kb_root_folder = FileService.get_kb_folder(user_id)
    kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])

    try:
        filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
        filetype = filename_type(filename)
        if filetype == FileType.OTHER.value:
            raise RuntimeError("This type of file has not been supported yet!")

        # Append underscores until the storage key is unused.
        location = filename
        while STORAGE_IMPL.obj_exist(kb_id, location):
            location += "_"
        STORAGE_IMPL.put(kb_id, location, blob)

        doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": user_id,
            "type": filetype,
            "name": filename,
            "location": location,
            "size": len(blob),
            "thumbnail": thumbnail(filename, blob),
        }

        # Later checks override earlier ones, so the order is significant.
        if filetype == FileType.VISUAL:
            doc["parser_id"] = ParserType.PICTURE.value
        if filetype == FileType.AURAL:
            doc["parser_id"] = ParserType.AUDIO.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            doc["parser_id"] = ParserType.PRESENTATION.value
        if re.search(r"\.(eml)$", filename):
            doc["parser_id"] = ParserType.EMAIL.value

        DocumentService.insert(doc)
        FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
    except Exception as exc:
        return server_error_response(exc)

    return get_json_result(data=True)
2025-05-19 14:54:06 +08:00
@manager.route("/create", methods=["POST"])  # noqa: F821
@login_required
@validate_request("name", "kb_id")
def create():
    """Create an empty virtual document inside a knowledge base.

    Expects a JSON body with ``name`` and ``kb_id``. The name is length
    checked (UTF-8 bytes, before stripping), must not be blank, and is then
    stripped of surrounding whitespace before the duplicate check.

    Returns:
        A JSON result with the inserted document serialised via
        ``to_json()``, or an error result when validation fails, the
        knowledge base is missing, or the name already exists in it.
    """
    req = request.json
    kb_id = req["kb_id"]
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)

    raw_name = req["name"]
    if len(raw_name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
        return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=settings.RetCode.ARGUMENT_ERROR)

    doc_name = raw_name.strip()
    if doc_name == "":
        return get_json_result(data=False, message="File name can't be empty.", code=settings.RetCode.ARGUMENT_ERROR)
    req["name"] = doc_name

    try:
        found, kb = KnowledgebaseService.get_by_id(kb_id)
        if not found:
            return get_data_error_result(message="Can't find this knowledgebase!")

        if DocumentService.query(name=doc_name, kb_id=kb_id):
            return get_data_error_result(message="Duplicated document name in the same knowledgebase.")

        new_doc = {
            "id": get_uuid(),
            "kb_id": kb.id,
            "parser_id": kb.parser_id,
            "parser_config": kb.parser_config,
            "created_by": current_user.id,
            "type": FileType.VIRTUAL,
            "name": doc_name,
            "location": "",
            "size": 0,
        }
        inserted = DocumentService.insert(new_doc)
        return get_json_result(data=inserted.to_json())
    except Exception as exc:
        return server_error_response(exc)
2025-05-19 14:54:06 +08:00
@manager.route("/list", methods=["POST"])  # noqa: F821
@login_required
def list_docs():
    """List the documents of a knowledge base with paging and filters.

    Query-string parameters: ``kb_id`` (required), ``keywords``, ``page``,
    ``page_size``, ``orderby`` (default ``create_time``) and ``desc``
    (anything other than ``false``, case-insensitively, means descending).
    The JSON body may carry ``run_status`` and ``types`` filter lists, which
    are validated against ``VALID_TASK_STATUS`` and ``VALID_FILE_TYPES``.

    Returns:
        A JSON result ``{"total": ..., "docs": [...]}``. Thumbnails that are
        not inline base64 images are rewritten to ``/v1/document/image/``
        URLs. An error result is returned for invalid arguments, when the
        current user belongs to no tenant owning the knowledge base, or when
        the query fails.
    """
    kb_id = request.args.get("kb_id")
    if not kb_id:
        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)

    tenants = UserTenantService.query(user_id=current_user.id)
    if not any(KnowledgebaseService.query(tenant_id=tenant.tenant_id, id=kb_id) for tenant in tenants):
        return get_json_result(data=False, message="Only owner of knowledgebase authorized for this operation.", code=settings.RetCode.OPERATING_ERROR)

    keywords = request.args.get("keywords", "")
    # Non-numeric paging values are a client error, not a server crash.
    try:
        page_number = int(request.args.get("page", 0))
        items_per_page = int(request.args.get("page_size", 0))
    except (TypeError, ValueError):
        return get_json_result(data=False, message='"page" and "page_size" must be integers.', code=settings.RetCode.ARGUMENT_ERROR)
    orderby = request.args.get("orderby", "create_time")
    desc = request.args.get("desc", "true").lower() != "false"

    # A null/empty JSON body means "no filters" rather than an AttributeError.
    req = request.get_json() or {}

    run_status = req.get("run_status", [])
    if run_status:
        invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS}
        if invalid_status:
            # map(str, ...) keeps the message buildable for non-string values.
            return get_data_error_result(message=f"Invalid filter run status conditions: {', '.join(map(str, invalid_status))}")

    types = req.get("types", [])
    if types:
        invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
        if invalid_types:
            return get_data_error_result(message=f"Invalid filter conditions: {', '.join(map(str, invalid_types))} type{'s' if len(invalid_types) > 1 else ''}")

    try:
        docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types)

        for doc_item in docs:
            # Stored thumbnails are served through the image endpoint;
            # inline base64 thumbnails are passed through untouched.
            if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
                doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"

        return get_json_result(data={"total": tol, "docs": docs})
    except Exception as e:
        return server_error_response(e)
2025-05-19 14:54:06 +08:00
@manager.route("/infos", methods=["POST"])  # noqa: F821
@login_required
def docinfos():
    """Return the stored records for a list of document ids.

    Expects a JSON body with ``doc_ids``. Every id must pass
    ``DocumentService.accessible`` for the current user; the first id that
    does not yields an authentication-error result.

    Returns:
        A JSON result whose data is the list of document dicts.
    """
    requested_ids = request.json["doc_ids"]

    for candidate in requested_ids:
        if DocumentService.accessible(candidate, current_user.id):
            continue
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    records = DocumentService.get_by_ids(requested_ids)
    return get_json_result(data=list(records.dicts()))
2025-05-19 14:54:06 +08:00
@manager.route("/thumbnails", methods=["GET"])  # noqa: F821
# @login_required
def thumbnails():
    """Return a mapping of document id to thumbnail for the given documents.

    Reads a comma-separated ``doc_ids`` query parameter. Empty segments are
    ignored; if no id remains (parameter missing or blank) an argument-error
    result is returned.

    Returns:
        A JSON result ``{doc_id: thumbnail}``. Thumbnails that are not
        inline base64 images are rewritten to ``/v1/document/image/`` URLs.
        A server-error response is returned if the lookup fails.
    """
    # Default to "" so a missing parameter cannot raise AttributeError, and
    # drop empty segments so the guard below actually fires: "".split(",")
    # is [""], which is truthy.
    raw_ids = request.args.get("doc_ids", "")
    doc_ids = [doc_id for doc_id in raw_ids.split(",") if doc_id]
    if not doc_ids:
        return get_json_result(data=False, message='Lack of "Document ID"', code=settings.RetCode.ARGUMENT_ERROR)

    try:
        docs = DocumentService.get_thumbnails(doc_ids)

        for doc_item in docs:
            if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
                doc_item["thumbnail"] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"

        return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
    except Exception as e:
        return server_error_response(e)
2025-05-19 14:54:06 +08:00
@manager.route("/change_status", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id", "status")
def change_status():
    """Enable or disable a document.

    Expects a JSON body with ``doc_id`` and ``status`` (``0`` or ``1``,
    compared after ``str()``). The status is written to the document record
    as a string and to the document store as the integer field
    ``available_int``.

    Returns:
        A JSON result with ``data=True`` on success, or an error result for
        an invalid status, missing authorization, a missing document or
        knowledge base, or a failed update.
    """
    req = request.json
    status_text = str(req["status"])
    if status_text not in ("0", "1"):
        return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=settings.RetCode.ARGUMENT_ERROR)

    doc_id = req["doc_id"]
    if not DocumentService.accessible(doc_id, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    try:
        found, doc = DocumentService.get_by_id(doc_id)
        if not found:
            return get_data_error_result(message="Document not found!")

        found, kb = KnowledgebaseService.get_by_id(doc.kb_id)
        if not found:
            return get_data_error_result(message="Can't find this knowledgebase!")

        if not DocumentService.update_by_id(doc_id, {"status": status_text}):
            return get_data_error_result(message="Database error (Document update)!")

        # Mirror the flag into the document store for this document.
        settings.docStoreConn.update(
            {"doc_id": doc_id},
            {"available_int": int(req["status"])},
            search.index_name(kb.tenant_id),
            doc.kb_id,
        )
        return get_json_result(data=True)
    except Exception as exc:
        return server_error_response(exc)
2025-05-19 14:54:06 +08:00
@manager.route("/rm", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id")
def rm():
    """Delete one or more documents together with their tasks and files.

    Expects a JSON body with ``doc_id`` holding a single id or a list of
    ids. Every id must pass ``DocumentService.accessible4deletion`` before
    anything is removed. Exceptions raised while deleting a document are
    collected as text and reported after all ids have been processed.

    Returns:
        A JSON result with ``data=True`` when no exception was collected,
        a data-error result when a document/tenant lookup or the removal
        itself fails, or a server-error result carrying the collected
        exception messages.
    """
    payload = request.json
    targets = payload["doc_id"]
    if isinstance(targets, str):
        targets = [targets]

    for target in targets:
        if not DocumentService.accessible4deletion(target, current_user.id):
            return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    root_folder = FileService.get_root_folder(current_user.id)
    FileService.init_knowledgebase_docs(root_folder["id"], current_user.id)

    collected_errors = ""
    # Per knowledge base: count fetched once from count_by_kb_id, then
    # decremented for every TABLE-parsed document removed in this request.
    remaining_tables = {}
    for target in targets:
        try:
            found, doc = DocumentService.get_by_id(target)
            if not found:
                return get_data_error_result(message="Document not found!")
            tenant_id = DocumentService.get_tenant_id(target)
            if not tenant_id:
                return get_data_error_result(message="Tenant not found!")

            # Resolve the storage address before the records are deleted.
            bucket, obj_name = File2DocumentService.get_storage_address(doc_id=target)

            TaskService.filter_delete([Task.doc_id == target])
            if not DocumentService.remove_document(doc, tenant_id):
                return get_data_error_result(message="Database error (Document removal)!")

            links = File2DocumentService.get_by_document_id(target)
            removed_files = 0
            if links:
                removed_files = FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == links[0].file_id])
            File2DocumentService.delete_by_document_id(target)
            # The stored object is removed only if a file row was deleted.
            if removed_files > 0:
                STORAGE_IMPL.rm(bucket, obj_name)

            if doc.parser_id == ParserType.TABLE:
                kb_id = doc.kb_id
                if kb_id not in remaining_tables:
                    remaining_tables[kb_id] = DocumentService.count_by_kb_id(kb_id=kb_id, keywords="", run_status=[TaskStatus.DONE], types=[])
                # NOTE(review): nesting reconstructed from flattened source —
                # the decrement and check are assumed to run for every TABLE
                # document, not only on first sight of the kb; confirm.
                remaining_tables[kb_id] -= 1
                if remaining_tables[kb_id] <= 0:
                    KnowledgebaseService.delete_field_map(kb_id)
        except Exception as exc:
            collected_errors += str(exc)

    if collected_errors:
        return get_json_result(data=False, message=collected_errors, code=settings.RetCode.SERVER_ERROR)
    return get_json_result(data=True)
2025-05-19 14:54:06 +08:00
@manager.route("/run", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_ids", "run")
def run():
    """Change the run state of documents and queue parsing where requested.

    Expects a JSON body with ``doc_ids`` and ``run``; an optional ``delete``
    flag additionally removes existing tasks and indexed chunks. When
    ``run`` equals ``TaskStatus.RUNNING.value`` each document is handed to
    ``queue_tasks``.

    Returns:
        A JSON result with ``data=True`` on success, an
        authentication-error result if any document is not accessible, a
        data-error result for a missing document or tenant, or a
        server-error response on any exception.
    """
    req = request.json
    for doc_id in req["doc_ids"]:
        if not DocumentService.accessible(doc_id, current_user.id):
            return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    try:
        # Cache of count_by_kb_id results, keyed by knowledge base id.
        table_counts = {}
        for doc_id in req["doc_ids"]:
            run_value = str(req["run"])
            wants_delete = req.get("delete", False)

            info = {"run": run_value, "progress": 0}
            if run_value == TaskStatus.RUNNING.value and wants_delete:
                info["progress_msg"] = ""
                info["chunk_num"] = 0
                info["token_num"] = 0

            # NOTE(review): nesting reconstructed from flattened source —
            # this lookup and the clear below are assumed to run for every
            # document regardless of the "delete" flag; confirm.
            found, doc = DocumentService.get_by_id(doc_id)
            if not found:
                return get_data_error_result(message="Document not found!")
            if doc.run == TaskStatus.DONE.value:
                DocumentService.clear_chunk_num_when_rerun(doc.id)

            DocumentService.update_by_id(doc_id, info)
            tenant_id = DocumentService.get_tenant_id(doc_id)
            if not tenant_id:
                return get_data_error_result(message="Tenant not found!")
            # Re-read after the update, as the original code does.
            found, doc = DocumentService.get_by_id(doc_id)
            if not found:
                return get_data_error_result(message="Document not found!")

            if wants_delete:
                TaskService.filter_delete([Task.doc_id == doc_id])
                index = search.index_name(tenant_id)
                if settings.docStoreConn.indexExist(index, doc.kb_id):
                    settings.docStoreConn.delete({"doc_id": doc_id}, search.index_name(tenant_id), doc.kb_id)

            if run_value != TaskStatus.RUNNING.value:
                continue

            _, doc = DocumentService.get_by_id(doc_id)
            doc = doc.to_dict()
            doc["tenant_id"] = tenant_id

            if doc.get("parser_id", ParserType.NAIVE) == ParserType.TABLE:
                kb_id = doc.get("kb_id")
                if not kb_id:
                    # Skips queueing for this document, as in the original.
                    continue
                if kb_id not in table_counts:
                    table_counts[kb_id] = DocumentService.count_by_kb_id(kb_id=kb_id, keywords="", run_status=[TaskStatus.DONE], types=[])
                    # NOTE(review): assumed to sit inside the first-sight
                    # branch (indentation lost in source); confirm.
                    if table_counts[kb_id] <= 0:
                        KnowledgebaseService.delete_field_map(kb_id)

            bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
            queue_tasks(doc, bucket, name, 0)

        return get_json_result(data=True)
    except Exception as exc:
        return server_error_response(exc)
2025-05-19 14:54:06 +08:00
@manager.route("/rename", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id", "name")
def rename():
    """Rename a document and the file record linked to it.

    Expects a JSON body with ``doc_id`` and ``name``. The new name must keep
    the document's extension (compared case-insensitively), fit within
    ``FILE_NAME_LEN_LIMIT`` UTF-8 bytes and not duplicate another document
    name in the same knowledge base.

    Returns:
        A JSON result with ``data=True`` on success, or an error result
        describing which check or update failed.
    """
    req = request.json
    doc_id = req["doc_id"]
    if not DocumentService.accessible(doc_id, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    try:
        found, doc = DocumentService.get_by_id(doc_id)
        if not found:
            return get_data_error_result(message="Document not found!")

        new_name = req["name"]
        new_suffix = pathlib.Path(new_name.lower()).suffix
        old_suffix = pathlib.Path(doc.name.lower()).suffix
        if new_suffix != old_suffix:
            return get_json_result(data=False, message="The extension of file can't be changed", code=settings.RetCode.ARGUMENT_ERROR)
        if len(new_name.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
            return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=settings.RetCode.ARGUMENT_ERROR)

        # The query result is re-checked for an exact name match.
        for existing in DocumentService.query(name=new_name, kb_id=doc.kb_id):
            if existing.name == new_name:
                return get_data_error_result(message="Duplicated document name in the same knowledgebase.")

        if not DocumentService.update_by_id(doc_id, {"name": new_name}):
            return get_data_error_result(message="Database error (Document rename)!")

        # Keep the linked file record's name in step with the document.
        links = File2DocumentService.get_by_document_id(doc_id)
        if links:
            _, linked_file = FileService.get_by_id(links[0].file_id)
            FileService.update_by_id(linked_file.id, {"name": new_name})

        return get_json_result(data=True)
    except Exception as exc:
        return server_error_response(exc)
2025-05-19 14:54:06 +08:00
@manager.route("/get/<doc_id>", methods=["GET"])  # noqa: F821
# @login_required
def get(doc_id):
    """Stream the stored content of a document.

    Args:
        doc_id: Identifier of the document, taken from the URL path.

    Returns:
        A Flask response holding the stored object. When the document name
        has an extension, ``Content-Type`` is set to ``image/<ext>`` for
        visual documents and ``application/<ext>`` otherwise. A data-error
        result is returned for an unknown document and a server-error
        response on any exception.
    """
    try:
        found, doc = DocumentService.get_by_id(doc_id)
        if not found:
            return get_data_error_result(message="Document not found!")

        bucket, obj_name = File2DocumentService.get_storage_address(doc_id=doc_id)
        response = flask.make_response(STORAGE_IMPL.get(bucket, obj_name))

        ext_match = re.search(r"\.([^.]+)$", doc.name)
        if ext_match:
            # The MIME subtype is simply the file extension.
            major = "image" if doc.type == FileType.VISUAL.value else "application"
            response.headers.set("Content-Type", f"{major}/{ext_match.group(1)}")
        return response
    except Exception as exc:
        return server_error_response(exc)
2025-05-19 14:54:06 +08:00
@manager.route("/change_parser", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id", "parser_id")
def change_parser():
    """Switch the parser of a document and reset its parsing state.

    Expects a JSON body with ``doc_id`` and ``parser_id`` and optionally
    ``parser_config``. Nothing is changed when the parser id matches
    (case-insensitively) and no different ``parser_config`` is supplied.
    Otherwise progress is reset, the run state is set to
    ``TaskStatus.UNSTART``, and for documents with tokens the counters are
    decremented and indexed chunks deleted.

    Returns:
        A JSON result with ``data=True`` on success or no-op, or an error
        result for missing authorization, an unknown document or tenant, or
        an unsupported parser for the document's type.
    """
    req = request.json
    if not DocumentService.accessible(req["doc_id"], current_user.id):
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    try:
        found, doc = DocumentService.get_by_id(req["doc_id"])
        if not found:
            return get_data_error_result(message="Document not found!")

        target_parser = req["parser_id"]
        same_parser = doc.parser_id.lower() == target_parser.lower()
        # No-op when the parser is unchanged and the config is absent/equal.
        if same_parser and ("parser_config" not in req or req["parser_config"] == doc.parser_config):
            return get_json_result(data=True)

        visual_mismatch = doc.type == FileType.VISUAL and target_parser != "picture"
        if visual_mismatch or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and target_parser != "presentation"):
            return get_data_error_result(message="Not supported yet!")

        reset_fields = {
            "parser_id": target_parser,
            "progress": 0,
            "progress_msg": "",
            "run": TaskStatus.UNSTART.value,
        }
        if not DocumentService.update_by_id(doc.id, reset_fields):
            return get_data_error_result(message="Document not found!")

        if "parser_config" in req:
            DocumentService.update_parser_config(doc.id, req["parser_config"])

        # NOTE(review): nesting reconstructed from flattened source — the
        # tenant lookup and index cleanup are assumed to belong to this
        # token_num branch; confirm.
        if doc.token_num > 0:
            # Subtract this document's contribution from the counters.
            decremented = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duation * -1)
            if not decremented:
                return get_data_error_result(message="Document not found!")
            tenant_id = DocumentService.get_tenant_id(req["doc_id"])
            if not tenant_id:
                return get_data_error_result(message="Tenant not found!")
            if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
                settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)

        return get_json_result(data=True)
    except Exception as exc:
        return server_error_response(exc)
2025-05-19 14:54:06 +08:00
@manager.route("/image/<image_id>", methods=["GET"])  # noqa: F821
# @login_required
def get_image(image_id):
    """Serve a stored image addressed as ``<bucket>-<name>``.

    Args:
        image_id: Identifier from the URL path; it must split on ``-`` into
            exactly two parts, used as the storage bucket and object name.

    Returns:
        A Flask response with ``Content-Type: image/JPEG``, a data-error
        result when ``image_id`` does not have exactly two parts, or a
        server-error response on any exception.
    """
    try:
        parts = image_id.split("-")
        if len(parts) != 2:
            return get_data_error_result(message="Image not found.")

        bucket, obj_name = parts
        response = flask.make_response(STORAGE_IMPL.get(bucket, obj_name))
        response.headers.set("Content-Type", "image/JPEG")
        return response
    except Exception as exc:
        return server_error_response(exc)
2025-05-19 14:54:06 +08:00
@manager.route("/upload_and_parse", methods=["POST"])  # noqa: F821
@login_required
@validate_request("conversation_id")
def upload_and_parse():
    """Upload files for a conversation and parse them in one step.

    Reads the files from the ``file`` multipart field and
    ``conversation_id`` from the form data, then delegates to
    ``doc_upload_and_parse``.

    Returns:
        A JSON result whose data is the value returned by
        ``doc_upload_and_parse``, or an argument-error result when the file
        part is missing or a file has an empty name.
    """
    if "file" not in request.files:
        return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)

    uploads = request.files.getlist("file")
    if any(item.filename == "" for item in uploads):
        return get_json_result(data=False, message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR)

    conversation_id = request.form.get("conversation_id")
    parsed_ids = doc_upload_and_parse(conversation_id, uploads, current_user.id)
    return get_json_result(data=parsed_ids)
2024-11-13 12:58:37 +08:00
2025-05-19 14:54:06 +08:00
class _DownloadedFile:
    """Minimal file-like wrapper around a file saved in the download directory.

    Exposes the ``filename`` attribute and ``read()`` method of the object
    handed to ``FileService.parse_docs``. Named so it does not shadow the
    ``File`` model imported at module level.
    """

    def __init__(self, filename, filepath):
        self.filename = filename
        self.filepath = filepath

    def read(self):
        """Return the raw bytes stored at ``filepath``."""
        with open(self.filepath, "rb") as f:
            return f.read()


@manager.route("/parse", methods=["POST"])  # noqa: F821
@login_required
def parse():
    """Extract text from a URL or from uploaded files.

    If the JSON body carries a ``url``, the page is loaded in headless
    Chrome. When more than one response was captured, the page source is
    parsed as HTML; otherwise the URL is treated as a file download and the
    file named in the response headers is parsed. Without a ``url``, the
    files in the ``file`` multipart field are parsed.

    Returns:
        A JSON result with the extracted text, or an argument-error result
        for an invalid URL, an unidentifiable download, or a missing file
        part.
    """
    # silent=True: a multipart upload has no JSON body, and must fall
    # through to the file branch instead of being rejected while reading it.
    payload = request.get_json(silent=True)
    url = payload.get("url") if isinstance(payload, dict) else ""
    if url:
        if not is_valid_url(url):
            return get_json_result(data=False, message="The URL format is invalid", code=settings.RetCode.ARGUMENT_ERROR)
        download_path = os.path.join(get_project_base_directory(), "logs/downloads")
        os.makedirs(download_path, exist_ok=True)
        from seleniumwire.webdriver import Chrome, ChromeOptions

        options = ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_experimental_option(
            "prefs",
            {
                "download.default_directory": download_path,
                "download.prompt_for_download": False,
                "download.directory_upgrade": True,
                "safebrowsing.enabled": True,
            },
        )
        driver = Chrome(options=options)
        # try/finally so the browser process is released on every path,
        # including the download branch and any exception.
        try:
            driver.get(url)
            res_headers = [r.response.headers for r in driver.requests if r and r.response]
            if len(res_headers) > 1:
                sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
                return get_json_result(data="\n".join(sections))

            r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
            # The name comes from remote response headers: keep only its
            # final path component so it cannot escape the download folder.
            safe_name = os.path.basename(r.group(1).replace("\\", "/")) if r else ""
            if not safe_name or safe_name in (".", ".."):
                return get_json_result(data=False, message="Can't not identify downloaded file", code=settings.RetCode.ARGUMENT_ERROR)
            f = _DownloadedFile(safe_name, os.path.join(download_path, safe_name))
            txt = FileService.parse_docs([f], current_user.id)
            return get_json_result(data=txt)
        finally:
            driver.quit()

    if "file" not in request.files:
        return get_json_result(data=False, message="No file part!", code=settings.RetCode.ARGUMENT_ERROR)

    file_objs = request.files.getlist("file")
    txt = FileService.parse_docs(file_objs, current_user.id)
    return get_json_result(data=txt)
2025-01-10 19:06:59 +08:00
2025-05-19 14:54:06 +08:00
@manager.route("/set_meta", methods=["POST"])  # noqa: F821
@login_required
@validate_request("doc_id", "meta")
def set_meta():
    """Replace the metadata fields of a document.

    Expects a JSON body with ``doc_id`` and ``meta``, where ``meta`` is a
    JSON-encoded string that must decode to an object (dict). The decoded
    dict is written to the document's ``meta_fields``.

    Returns:
        A JSON result with ``data=True`` on success, or an error result for
        missing authorization, malformed or non-object ``meta``, an unknown
        document, or a failed update.
    """
    req = request.json
    doc_id = req["doc_id"]
    if not DocumentService.accessible(doc_id, current_user.id):
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)

    try:
        meta = json.loads(req["meta"])
    except Exception as exc:
        return get_json_result(data=False, message=f"Json syntax error: {exc}", code=settings.RetCode.ARGUMENT_ERROR)
    if not isinstance(meta, dict):
        return get_json_result(data=False, message='Meta data should be in Json map format, like {"key": "value"}', code=settings.RetCode.ARGUMENT_ERROR)

    try:
        found, _doc = DocumentService.get_by_id(doc_id)
        if not found:
            return get_data_error_result(message="Document not found!")

        if not DocumentService.update_by_id(doc_id, {"meta_fields": meta}):
            return get_data_error_result(message="Database error (meta updates)!")

        return get_json_result(data=True)
    except Exception as exc:
        return server_error_response(exc)