Fix: remove unnecessary parameter constraints from dataset creation (#7432)

### What problem does this PR solve?

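Remove unnecessary parameter restrictions in the dataset creation API: the upper bound of 10000 on `task_page_size` and on `raptor.random_seed` is dropped, while the lower bounds are unchanged.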

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
liu an 2025-04-30 14:50:23 +08:00 committed by GitHub
parent e6c824e606
commit 1f82889001
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 5 additions and 16 deletions

View File

@ -20,7 +20,7 @@ from pydantic import BaseModel, Field, StringConstraints, ValidationError, field
from strenum import StrEnum
def format_validation_error_message(e: ValidationError):
def format_validation_error_message(e: ValidationError) -> str:
error_messages = []
for error in e.errors():
@ -81,7 +81,7 @@ class RaptorConfig(Base):
max_token: int = Field(default=256, ge=1, le=2048)
threshold: float = Field(default=0.1, ge=0.0, le=1.0)
max_cluster: int = Field(default=64, ge=1, le=1024)
random_seed: int = Field(default=0, ge=0, le=10_000)
random_seed: int = Field(default=0, ge=0)
class GraphragConfig(Base):
@ -104,7 +104,7 @@ class ParserConfig(Base):
tag_kb_ids: List[str] = Field(default_factory=list)
topn_tags: int = Field(default=1, ge=1, le=10)
filename_embd_weight: Optional[float] = Field(default=None, ge=0.0, le=1.0)
task_page_size: Optional[int] = Field(default=None, ge=1, le=10_000)
task_page_size: Optional[int] = Field(default=None, ge=1)
pages: Optional[List[List[int]]] = None

View File

@ -429,7 +429,6 @@ curl --request POST \
- `"task_page_size"`: `int` For PDF only.
- Defaults to `12`
- Minimum: `1`
- Maximum: `10000`
- `"raptor"`: `object` RAPTOR-specific settings.
- Defaults to: `{"use_raptor": false}`
- `"graphrag"`: `object` GRAPHRAG-specific settings.

View File

@ -311,8 +311,7 @@ class TestDatasetCreation:
("filename_embd_weight_mid", {"filename_embd_weight": 0.5}),
("filename_embd_weight_max", {"filename_embd_weight": 1.0}),
("task_page_size_min", {"task_page_size": 1}),
("task_page_size_mid", {"task_page_size": 5_000}),
("task_page_size_max", {"task_page_size": 10_000}),
("task_page_size_None", {"task_page_size": None}),
("pages", {"pages": [[1, 100]]}),
("pages_none", None),
("graphrag_true", {"graphrag": {"use_graphrag": True}}),
@ -337,8 +336,6 @@ class TestDatasetCreation:
("raptor_max_cluster_mid", {"raptor": {"max_cluster": 512}}),
("raptor_max_cluster_max", {"raptor": {"max_cluster": 1024}}),
("raptor_random_seed_min", {"raptor": {"random_seed": 0}}),
("raptor_random_seed_mid", {"raptor": {"random_seed": 5_000}}),
("raptor_random_seed_max", {"raptor": {"random_seed": 10_000}}),
],
ids=[
"default_none",
@ -366,8 +363,7 @@ class TestDatasetCreation:
"filename_embd_weight_mid",
"filename_embd_weight_max",
"task_page_size_min",
"task_page_size_mid",
"task_page_size_max",
"task_page_size_None",
"pages",
"pages_none",
"graphrag_true",
@ -392,8 +388,6 @@ class TestDatasetCreation:
"raptor_max_cluster_mid",
"raptor_max_cluster_max",
"raptor_random_seed_min",
"raptor_random_seed_mid",
"raptor_random_seed_max",
],
)
def test_valid_parser_config(self, get_http_api_auth, name, parser_config):
@ -462,7 +456,6 @@ class TestDatasetCreation:
("filename_embd_weight_max_limit", {"filename_embd_weight": 1.1}, "Input should be less than or equal to 1"),
("filename_embd_weight_type_invalid", {"filename_embd_weight": "string"}, "Input should be a valid number, unable to parse string as a number"),
("task_page_size_min_limit", {"task_page_size": 0}, "Input should be greater than or equal to 1"),
("task_page_size_max_limit", {"task_page_size": 10_001}, "Input should be less than or equal to 10000"),
("task_page_size_float_not_allowed", {"task_page_size": 3.14}, "Input should be a valid integer, got a number with a fractional part"),
("task_page_size_type_invalid", {"task_page_size": "string"}, "Input should be a valid integer, unable to parse string as an integer"),
("pages_not_list", {"pages": "1,2"}, "Input should be a valid list"),
@ -490,7 +483,6 @@ class TestDatasetCreation:
("raptor_max_cluster_float_not_allowed", {"raptor": {"max_cluster": 3.14}}, "Input should be a valid integer, got a number with a fractional par"),
("raptor_max_cluster_type_invalid", {"raptor": {"max_cluster": "string"}}, "Input should be a valid integer, unable to parse string as an integer"),
("raptor_random_seed_min_limit", {"raptor": {"random_seed": -1}}, "Input should be greater than or equal to 0"),
("raptor_random_seed_max_limit", {"raptor": {"random_seed": 10_001}}, "Input should be less than or equal to 10000"),
("raptor_random_seed_float_not_allowed", {"raptor": {"random_seed": 3.14}}, "Input should be a valid integer, got a number with a fractional part"),
("raptor_random_seed_type_invalid", {"raptor": {"random_seed": "string"}}, "Input should be a valid integer, unable to parse string as an integer"),
("parser_config_type_invalid", {"delimiter": "a" * 65536}, "Parser config have at most 65535 characters"),
@ -520,7 +512,6 @@ class TestDatasetCreation:
"filename_embd_weight_max_limit",
"filename_embd_weight_type_invalid",
"task_page_size_min_limit",
"task_page_size_max_limit",
"task_page_size_float_not_allowed",
"task_page_size_type_invalid",
"pages_not_list",
@ -548,7 +539,6 @@ class TestDatasetCreation:
"raptor_max_cluster_float_not_allowed",
"raptor_max_cluster_type_invalid",
"raptor_random_seed_min_limit",
"raptor_random_seed_max_limit",
"raptor_random_seed_float_not_allowed",
"raptor_random_seed_type_invalid",
"parser_config_type_invalid",