
## Why are these changes needed?

https://github.com/user-attachments/assets/e160f16d-f42d-49e2-a6c6-687e4e6786f4

Enable file upload/paste as a task in AGS. This enables tasks like:

- Can you research and fact check the ideas in this screenshot?
- Summarize this file.

Only text and images are supported for now. Under the hood, it constructs `TextMessage` and `MultiModalMessage` objects as the task.

## Related issue number

Closes #5773

## Checks

- [ ] I've included any doc changes needed for <https://microsoft.github.io/autogen/>. See <https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to build and test documentation locally.
- [ ] I've added tests (if relevant) corresponding to the changes introduced in this PR.
- [ ] I've made sure all auto checks have passed.

---------

Co-authored-by: Jack Gerrits <jackgerrits@users.noreply.github.com>
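
As a minimal sketch of the idea, an uploaded file or pasted image could be wrapped into a task message like this. The `build_task` helper is hypothetical; `TextMessage` and `MultiModalMessage` (from `autogen_agentchat.messages`) and `Image` (from `autogen_core`) are the real types, but the exact AGS wiring may differ:

```python
# Illustrative sketch: wrap user input as a task message (helper name is hypothetical).
from autogen_agentchat.messages import MultiModalMessage, TextMessage
from autogen_core import Image


def build_task(text: str, image: Image | None = None) -> TextMessage | MultiModalMessage:
    """Return a plain text task, or a multimodal task when an image is attached."""
    if image is None:
        return TextMessage(content=text, source="user")
    return MultiModalMessage(content=[text, image], source="user")
```
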
# datamodel/eval.py
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence
from uuid import UUID, uuid4

from autogen_agentchat.base import TaskResult
from autogen_core import Image
from pydantic import BaseModel
from sqlmodel import Field

class EvalTask(BaseModel):
    """Definition of a task to be evaluated."""

    task_id: UUID | str = Field(default_factory=uuid4)
    input: str | Sequence[str | Image]  # plain text, or a mix of text and images
    name: str = ""
    description: str = ""
    expected_outputs: Optional[List[Any]] = None
    metadata: Dict[str, Any] = {}

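# Example (illustrative): a multimodal task, e.g. for a pasted screenshot.
# Assumes an image already loaded via autogen_core.Image (e.g. Image.from_base64(...)):
#
#   task = EvalTask(
#       name="fact-check-screenshot",
#       input=["Fact check the ideas in this screenshot", screenshot_image],
#   )
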
class EvalRunResult(BaseModel):
    """Result of an evaluation run."""

    result: TaskResult | None = None
    status: bool = False
    # default_factory so the timestamp is taken per instance, not once at import time
    start_time: Optional[datetime] = Field(default_factory=datetime.now)
    end_time: Optional[datetime] = None
    error: Optional[str] = None

class EvalDimensionScore(BaseModel):
    """Score for a single evaluation dimension."""

    dimension: str
    score: float
    reason: str
    max_value: float
    min_value: float

class EvalScore(BaseModel):
    """Composite score from evaluation."""

    overall_score: Optional[float] = None
    dimension_scores: List[EvalDimensionScore] = []
    reason: Optional[str] = None
    max_value: float = 10.0
    min_value: float = 0.0
    metadata: Dict[str, Any] = {}

class EvalJudgeCriteria(BaseModel):
    """Criteria for judging evaluation results."""

    dimension: str
    prompt: str
    max_value: float = 10.0
    min_value: float = 0.0
    metadata: Dict[str, Any] = {}

class EvalRunStatus(str, Enum):
    """Status of an evaluation run."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELED = "canceled"

class EvalResult(BaseModel):
    """Result of an evaluation run."""

    task_id: UUID | str
    # runner_id: UUID | str
    status: EvalRunStatus = EvalRunStatus.PENDING
    # default_factory so the timestamp is taken per instance, not once at import time
    start_time: Optional[datetime] = Field(default_factory=datetime.now)
    end_time: Optional[datetime] = None
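
A quick usage sketch of the models above (illustrative values; assumes the classes are imported from this module):

```python
# Illustrative use of the eval data models defined above.
from datetime import datetime

task = EvalTask(input="Summarize this file", name="summarize")

run = EvalRunResult(status=True, start_time=datetime.now(), end_time=datetime.now())

score = EvalScore(
    overall_score=8.5,
    reason="Good summary with minor omissions",
    dimension_scores=[
        EvalDimensionScore(
            dimension="accuracy",
            score=8.5,
            reason="Claims match the source text",
            max_value=10.0,
            min_value=0.0,
        )
    ],
)
```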