# datamodel/eval.py
# Mirror of https://github.com/microsoft/autogen.git (synced 2025-07-26).
|
||
|
from datetime import datetime
|
||
|
from enum import Enum
|
||
|
from typing import Any, Dict, List, Optional, Sequence
|
||
|
from uuid import UUID, uuid4
|
||
|
|
||
|
from autogen_agentchat.base import TaskResult
|
||
|
from autogen_core import Image
|
||
|
from pydantic import BaseModel
|
||
|
from sqlmodel import Field
|
||
|
|
||
|
|
||
|
class EvalTask(BaseModel):
    """Definition of a task to be evaluated.

    Bundles the input handed to an evaluation run together with
    identifying metadata and, optionally, reference outputs.
    """

    # Stable identifier; a fresh UUID is generated for each instance.
    task_id: UUID | str = Field(default_factory=uuid4)
    # A single text prompt, or a mixed sequence of text and images.
    input: str | Sequence[str | Image]
    # Human-readable label for the task.
    name: str = ""
    description: str = ""
    # Reference outputs to compare against; None when no ground truth exists.
    expected_outputs: List[Any] | None = None
    # Free-form extra data; default_factory gives each instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
||
|
|
||
|
|
||
|
class EvalRunResult(BaseModel):
    """Result of an evaluation run.

    Holds the raw task result plus run bookkeeping (success flag,
    timing, and error text when the run failed).
    """

    # Raw result produced by the agent run; None until a run completes.
    result: TaskResult | None = None
    # True when the run finished successfully.
    status: bool = False
    # Fix: the original used Field(default=datetime.now()), which evaluates
    # datetime.now() ONCE at class-definition time, so every instance shared
    # the same stale timestamp. default_factory is called per instance.
    start_time: Optional[datetime] = Field(default_factory=datetime.now)
    end_time: Optional[datetime] = None
    # Human-readable error description when the run failed.
    error: Optional[str] = None
|
||
|
|
||
|
|
||
|
class EvalDimensionScore(BaseModel):
    """Score for a single evaluation dimension.

    All fields are required; one instance is produced per judged dimension.
    """

    # Name of the dimension being scored (e.g. matches EvalJudgeCriteria.dimension).
    dimension: str
    # Awarded value — presumably within [min_value, max_value]; not enforced here.
    score: float
    # Judge's justification for the awarded score.
    reason: str
    # Upper bound of the scoring scale for this dimension.
    max_value: float
    # Lower bound of the scoring scale for this dimension.
    min_value: float
|
||
|
|
||
|
|
||
|
class EvalScore(BaseModel):
    """Composite score from evaluation.

    Aggregates per-dimension scores into one overall result on a
    configurable scale (defaults to 0.0–10.0).
    """

    # Aggregate score; None when no scoring has been performed yet.
    overall_score: float | None = None
    # Per-dimension breakdown; default_factory keeps instances independent.
    dimension_scores: List[EvalDimensionScore] = Field(default_factory=list)
    # Optional overall justification for the composite score.
    reason: str | None = None
    # Bounds of the scoring scale.
    max_value: float = 10.0
    min_value: float = 0.0
    # Free-form extra data attached to this score.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
||
|
|
||
|
|
||
|
class EvalJudgeCriteria(BaseModel):
    """Criteria for judging evaluation results.

    One instance describes how a judge should score a single dimension.
    """

    # Dimension this criterion applies to.
    dimension: str
    # Instructions given to the judge for scoring this dimension.
    prompt: str
    # Bounds of the scale the judge must score within.
    max_value: float = 10.0
    min_value: float = 0.0
    # Free-form extra data; default_factory gives each instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
||
|
|
||
|
|
||
|
class EvalRunStatus(str, Enum):
    """Status of an evaluation run.

    Inherits from str, so members compare equal to (and serialize as)
    their plain string values.
    """

    PENDING = "pending"      # created but not yet started
    RUNNING = "running"      # currently executing
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"        # finished with an error
    CANCELED = "canceled"    # stopped before completion
|
||
|
|
||
|
|
||
|
class EvalResult(BaseModel):
|
||
|
"""Result of an evaluation run."""
|
||
|
|
||
|
task_id: UUID | str
|
||
|
# runner_id: UUID | str
|
||
|
status: EvalRunStatus = EvalRunStatus.PENDING
|
||
|
start_time: Optional[datetime] = Field(default=datetime.now())
|
||
|
end_time: Optional[datetime] = None
|