#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import uuid
from enum import auto
from typing import Annotated, Any

from flask import Request
from pydantic import UUID1, BaseModel, ConfigDict, Field, StringConstraints, ValidationError, field_serializer, field_validator
from strenum import StrEnum
from werkzeug.exceptions import BadRequest, UnsupportedMediaType

from api.constants import DATASET_NAME_LIMIT
def validate_and_parse_json_request ( request : Request , validator : type [ BaseModel ] , * , extras : dict [ str , Any ] | None = None , exclude_unset : bool = False ) - > tuple [ dict [ str , Any ] | None , str | None ] :
"""
Validates and parses JSON requests through a multi - stage validation pipeline .
2025-05-06 17:38:06 +08:00
2025-05-09 19:17:08 +08:00
Implements a four - stage validation process :
2025-05-06 17:38:06 +08:00
1. Content - Type verification ( must be application / json )
2. JSON syntax validation
3. Payload structure type checking
4. Pydantic model validation with error formatting
Args :
request ( Request ) : Flask request object containing HTTP payload
2025-05-09 19:17:08 +08:00
validator ( type [ BaseModel ] ) : Pydantic model class for data validation
extras ( dict [ str , Any ] | None ) : Additional fields to merge into payload
before validation . These fields will be removed from the final output
exclude_unset ( bool ) : Whether to exclude fields that have not been explicitly set
2025-05-06 17:38:06 +08:00
Returns :
tuple [ Dict [ str , Any ] | None , str | None ] :
- First element :
- Validated dictionary on success
- None on validation failure
- Second element :
- None on success
- Diagnostic error message on failure
Raises :
2025-05-09 19:17:08 +08:00
UnsupportedMediaType : When Content - Type header is not application / json
2025-05-06 17:38:06 +08:00
BadRequest : For structural JSON syntax errors
ValidationError : When payload violates Pydantic schema rules
Examples :
2025-05-09 19:17:08 +08:00
>> > validate_and_parse_json_request ( valid_request , DatasetSchema )
( { " name " : " Dataset1 " , " format " : " csv " } , None )
>> > validate_and_parse_json_request ( xml_request , DatasetSchema )
( None , " Unsupported content type: Expected application/json, got text/xml " )
>> > validate_and_parse_json_request ( bad_json_request , DatasetSchema )
( None , " Malformed JSON syntax: Missing commas/brackets or invalid encoding " )
Notes :
1. Validation Priority :
- Content - Type verification precedes JSON parsing
- Structural validation occurs before schema validation
2. Extra fields added via ` extras ` parameter are automatically removed
from the final output after validation
2025-05-06 17:38:06 +08:00
"""
try :
payload = request . get_json ( ) or { }
except UnsupportedMediaType :
return None , f " Unsupported content type: Expected application/json, got { request . content_type } "
except BadRequest :
return None , " Malformed JSON syntax: Missing commas/brackets or invalid encoding "
if not isinstance ( payload , dict ) :
return None , f " Invalid request payload: expected object, got { type ( payload ) . __name__ } "
try :
2025-05-09 19:17:08 +08:00
if extras is not None :
payload . update ( extras )
2025-05-06 17:38:06 +08:00
validated_request = validator ( * * payload )
except ValidationError as e :
return None , format_validation_error_message ( e )
2025-05-09 19:17:08 +08:00
parsed_payload = validated_request . model_dump ( by_alias = True , exclude_unset = exclude_unset )
if extras is not None :
for key in list ( parsed_payload . keys ( ) ) :
if key in extras :
del parsed_payload [ key ]
2025-05-06 17:38:06 +08:00
return parsed_payload , None
2025-04-29 16:53:57 +08:00
2025-04-30 14:50:23 +08:00
def format_validation_error_message ( e : ValidationError ) - > str :
2025-05-09 19:17:08 +08:00
"""
Formats validation errors into a standardized string format .
2025-05-06 17:38:06 +08:00
Processes pydantic ValidationError objects to create human - readable error messages
containing field locations , error descriptions , and input values .
Args :
e ( ValidationError ) : The validation error instance containing error details
Returns :
str : Formatted error messages joined by newlines . Each line contains :
- Field path ( dot - separated )
- Error message
- Truncated input value ( max 128 chars )
Example :
>> > try :
. . . UserModel ( name = 123 , email = " invalid " )
. . . except ValidationError as e :
. . . print ( format_validation_error_message ( e ) )
Field : < name > - Message : < Input should be a valid string > - Value : < 123 >
Field : < email > - Message : < value is not a valid email address > - Value : < invalid >
"""
2025-04-29 16:53:57 +08:00
error_messages = [ ]
for error in e . errors ( ) :
field = " . " . join ( map ( str , error [ " loc " ] ) )
msg = error [ " msg " ]
input_val = error [ " input " ]
input_str = str ( input_val )
if len ( input_str ) > 128 :
input_str = input_str [ : 125 ] + " ... "
error_msg = f " Field: < { field } > - Message: < { msg } > - Value: < { input_str } > "
error_messages . append ( error_msg )
return " \n " . join ( error_messages )
class PermissionEnum ( StrEnum ) :
me = auto ( )
team = auto ( )
class ChunkMethodnEnum ( StrEnum ) :
naive = auto ( )
book = auto ( )
email = auto ( )
laws = auto ( )
manual = auto ( )
one = auto ( )
paper = auto ( )
picture = auto ( )
presentation = auto ( )
qa = auto ( )
table = auto ( )
tag = auto ( )
class GraphragMethodEnum ( StrEnum ) :
light = auto ( )
general = auto ( )
class Base(BaseModel):
    """Common base for all request schemas; unknown fields are rejected."""

    # pydantic v2 style: the class-based `Config` used previously is deprecated
    # since pydantic 2.0 (this file already relies on v2-only APIs such as
    # model_dump and field_validator). Behavior is unchanged: extra="forbid"
    # makes validation fail on any field not declared by the schema.
    model_config = ConfigDict(extra="forbid")
class RaptorConfig(Base):
    """RAPTOR settings nested under a dataset's parser configuration."""

    # Master switch; the remaining fields only matter when True.
    use_raptor: bool = Field(default=False)
    # Summarization prompt; the literal `{cluster_content}` placeholder is
    # presumably substituted with the clustered chunk text downstream — confirm.
    prompt: Annotated[
        str,
        StringConstraints(strip_whitespace=True, min_length=1),
        Field(
            default="Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n{cluster_content}\nThe above is the content you need to summarize."
        ),
    ]
    # Bounded numeric knobs; semantics enforced here are only the ranges shown.
    max_token: int = Field(default=256, ge=1, le=2048)
    threshold: float = Field(default=0.1, ge=0.0, le=1.0)
    max_cluster: int = Field(default=64, ge=1, le=1024)
    random_seed: int = Field(default=0, ge=0)
class GraphragConfig(Base):
    """GraphRAG (knowledge-graph extraction) settings for a dataset parser."""

    # Master switch for graph extraction.
    use_graphrag: bool = Field(default=False)
    # Entity categories to extract; default_factory avoids a shared mutable default.
    entity_types: list[str] = Field(default_factory=lambda: ["organization", "person", "geo", "event", "category"])
    method: GraphragMethodEnum = Field(default=GraphragMethodEnum.light)
    # NOTE(review): presumably toggles community detection / entity resolution
    # passes in the graph pipeline — confirm against the consumer.
    community: bool = Field(default=False)
    resolution: bool = Field(default=False)
class ParserConfig(Base):
    """Per-dataset document parsing options (stored as the dataset's parser_config)."""

    # 0 disables the corresponding automatic extraction (range-checked only here).
    auto_keywords: int = Field(default=0, ge=0, le=32)
    auto_questions: int = Field(default=0, ge=0, le=10)
    chunk_token_num: int = Field(default=128, ge=1, le=2048)
    # Raw string: the default value is the two characters backslash + 'n',
    # not a newline.
    delimiter: str = Field(default=r"\n", min_length=1)
    # Optional nested sections; None means the feature block was not supplied.
    graphrag: GraphragConfig | None = None
    html4excel: bool = False  # NOTE(review): presumably renders Excel as HTML — confirm
    layout_recognize: str = "DeepDOC"
    raptor: RaptorConfig | None = None
    tag_kb_ids: list[str] = Field(default_factory=list)
    topn_tags: int = Field(default=1, ge=1, le=10)
    filename_embd_weight: float | None = Field(default=None, ge=0.0, le=1.0)
    task_page_size: int | None = Field(default=None, ge=1)
    pages: list[list[int]] | None = None
class CreateDatasetReq(Base):
    """Request schema for dataset creation.

    Serialization aliases map API field names to their storage names
    (embedding_model -> embd_id, chunk_method -> parser_id); dump with
    by_alias=True to obtain the storage form.
    """

    # Required; whitespace-stripped, 1..DATASET_NAME_LIMIT characters.
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(...)]
    # Optional base64 data URL; structure checked by validate_avatar_base64.
    avatar: str | None = Field(default=None, max_length=65535)
    description: str | None = Field(default=None, max_length=65535)
    # "<model_name>@<provider>". The default "" bypasses validate_embedding_model
    # because pydantic does not validate default values.
    embedding_model: Annotated[str, StringConstraints(strip_whitespace=True, max_length=255), Field(default="", serialization_alias="embd_id")]
    # Case-insensitive on input: permission_auto_lowercase runs before enum coercion.
    permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)]
    chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")]
    pagerank: int = Field(default=0, ge=0, le=100)
    # {} is normalized to None before validation; serialized size is capped after.
    parser_config: ParserConfig | None = Field(default=None)

    @field_validator("avatar")
    @classmethod
    def validate_avatar_base64(cls, v: str | None) -> str | None:
        """
        Validates Base64-encoded avatar string format and MIME type compliance.

        Implements a three-stage validation workflow:
        1. MIME prefix existence check
        2. MIME prefix format validation
        3. Supported type verification

        Args:
            v (str | None): Raw avatar field value

        Returns:
            str | None: Validated Base64 string (None passes through unchanged)

        Raises:
            ValueError: For structural errors in these cases:
              - Missing MIME prefix header
              - Invalid MIME prefix format
              - Unsupported image MIME type

        Example:
            ```python
            # Valid case
            CreateDatasetReq(avatar="data:image/png;base64,iVBORw0KGg...")

            # Invalid cases
            CreateDatasetReq(avatar="image/jpeg;base64,...")  # Missing 'data:' prefix
            CreateDatasetReq(avatar="data:video/mp4;base64,...")  # Unsupported MIME type
            ```
        """
        if v is None:
            return v

        if "," in v:
            prefix, _ = v.split(",", 1)
            if not prefix.startswith("data:"):
                raise ValueError("Invalid MIME prefix format. Must start with 'data:'")

            # Strip the "data:" prefix, then drop any ";base64"-style parameters.
            mime_type = prefix[5:].split(";")[0]
            supported_mime_types = ["image/jpeg", "image/png"]
            if mime_type not in supported_mime_types:
                raise ValueError(f"Unsupported MIME type. Allowed: {supported_mime_types}")

            return v
        else:
            raise ValueError("Missing MIME prefix. Expected format: data:<mime>;base64,<data>")

    @field_validator("embedding_model", mode="after")
    @classmethod
    def validate_embedding_model(cls, v: str) -> str:
        """
        Validates embedding model identifier format compliance.

        Validation pipeline:
        1. Separator presence check
        2. Component non-empty check
        3. Whitespace-only component rejection

        Args:
            v (str): Raw model identifier

        Returns:
            str: Validated <model_name>@<provider> identifier

        Raises:
            ValueError: For these violations:
              - Missing @ separator
              - Empty model_name / provider
              - Whitespace-only component

        Examples:
            Valid: "text-embedding-3-large@openai"
            Invalid: "invalid_model" (no @)
            Invalid: "@openai" (empty model_name)
            Invalid: "text-embedding-3-large@" (empty provider)
        """
        if "@" not in v:
            raise ValueError("Embedding model identifier must follow <model_name>@<provider> format")

        components = v.split("@", 1)
        if len(components) != 2 or not all(components):
            raise ValueError("Both model_name and provider must be non-empty strings")

        model_name, provider = components
        if not model_name.strip() or not provider.strip():
            raise ValueError("Model name and provider cannot be whitespace-only strings")

        return v

    @field_validator("permission", mode="before")
    @classmethod
    def permission_auto_lowercase(cls, v: Any) -> Any:
        """
        Normalize permission input to lowercase for consistent PermissionEnum matching.

        Args:
            v (Any): Raw input value for the permission field

        Returns:
            Lowercase string if input is string type, otherwise the original value

        Behavior:
            - Converts string inputs to lowercase (e.g., "ME" -> "me")
            - Non-string values pass through unchanged
            - Runs in the validation pre-processing stage (before enum conversion)
        """
        return v.lower() if isinstance(v, str) else v

    @field_validator("parser_config", mode="before")
    @classmethod
    def normalize_empty_parser_config(cls, v: Any) -> Any:
        """
        Normalizes empty parser configuration by converting empty dictionaries to None.

        Ensures consistent handling of empty parser configurations across the
        application by treating {} and None identically.

        Args:
            v (Any): Raw input value for the parser_config field

        Returns:
            Any: None if the input is an empty dict, otherwise the original value

        Example:
            >>> normalize_empty_parser_config({})
            None

            >>> normalize_empty_parser_config({"key": "value"})
            {"key": "value"}
        """
        if v == {}:
            return None
        return v

    @field_validator("parser_config", mode="after")
    @classmethod
    def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
        """
        Validates serialized JSON length constraints for parser configuration.

        Implements a three-stage validation workflow:
        1. Null check - bypass validation for absent configurations
        2. Model serialization - convert the Pydantic model to a JSON string
        3. Size verification - enforce the maximum allowed payload size

        Args:
            v (ParserConfig | None): Validated parser configuration object

        Returns:
            ParserConfig | None: The configuration object, unchanged

        Raises:
            ValueError: When the serialized JSON exceeds 65,535 characters
        """
        if v is None:
            return None

        if (json_str := v.model_dump_json()) and len(json_str) > 65535:
            raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")

        return v
class UpdateDatasetReq(CreateDatasetReq):
    """Request schema for updating an existing dataset.

    Extends CreateDatasetReq with a mandatory dataset_id, and overrides name
    with a default of "" — since pydantic does not validate defaults, an
    omitted name passes through as "" (presumably meaning "not provided";
    confirm against the update handler).
    """

    dataset_id: UUID1 = Field(...)
    name: Annotated[str, StringConstraints(strip_whitespace=True, min_length=1, max_length=DATASET_NAME_LIMIT), Field(default="")]

    @field_serializer("dataset_id")
    def serialize_uuid_to_hex(self, v: uuid.UUID) -> str:
        # Serialize as the 32-character lowercase hex form (no hyphens).
        return str(v).replace("-", "")