Mirror of https://github.com/microsoft/autogen.git, synced 2025-12-09 13:57:37 +00:00
Improvements to agbench (#5776)
1. Add host network support in Docker and remove the unused `--requirements` check from argument validation.
2. Use Pandas to simplify summary statistic calculations.
3. Add running time to summary statistics.

With these changes, the default `tabulate` output looks like this (a short Pandas sketch of the same aggregation follows the example):
```
Using tabulation method defined in '/home/ekzhu/autogen/python/packages/agbench/benchmarks/HumanEval/Scripts/custom_tabulate.py'
Task Id Trial 0 Success Trial 0 Time
-- ------------ ----------------- --------------
0 HumanEval_0 True 3
1 HumanEval_1 False 15
2 HumanEval_2 True 2
3 HumanEval_3 True 11
4 HumanEval_4 True 4
5 HumanEval_5 True 2
6 HumanEval_6 False 18
7 HumanEval_7 True 2
8 HumanEval_8 True 2
9 HumanEval_9 True 12
10 HumanEval_10 False 11
11 HumanEval_11 True 2
12 HumanEval_12 True 3
13 HumanEval_13 True 1
14 HumanEval_14 True 4
15 HumanEval_15 True 1
16 HumanEval_16 True 2
17 HumanEval_17 False 76
18 HumanEval_18 True 4
19 HumanEval_19 True 3
20 HumanEval_20 True 5
21 HumanEval_21 True 3
22 HumanEval_22 True 1
23 HumanEval_23 True 2
24 HumanEval_24 nan
Summary Statistics
Successes Failures Missing Total Average Success Rate Average Time Total Time
------- ----------- ---------- --------- ------- ---------------------- -------------- ------------
Trial 0 20 4 1 25 0.8 7.875 189
CAUTION: 'autogenbench tabulate' is in early preview and is not thoroughly tested.
Please do not cite values from these calculations in academic work without first inspecting and verifying the results in the run logs yourself.
```
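For anyone who wants to reproduce the per-trial aggregation outside agbench, here is a minimal sketch of the same idea in plain Pandas. It is illustrative only: the column names mirror the output above, but the three-row dataset is made up, and the real logic lives in `tabulate_cmd.py` (see the diff further down).

```python
# Illustrative sketch only; agbench's own aggregation is in tabulate_cmd.py.
import pandas as pd

# Made-up results in the same shape as the table above.
df = pd.DataFrame(
    [
        {"Task Id": "HumanEval_0", "Trial 0 Success": True, "Trial 0 Time": 3},
        {"Task Id": "HumanEval_1", "Trial 0 Success": False, "Trial 0 Time": 15},
        {"Task Id": "HumanEval_24", "Trial 0 Success": None, "Trial 0 Time": None},
    ]
)

score_cols = ["Trial 0 Success"]
time_cols = ["Trial 0 Time"]

# Only literal True counts as a success and only literal False as a failure,
# so missing entries (None) end up in the "Missing" column.
successes = df[score_cols].apply(lambda col: col.map(lambda v: v is True)).sum()
failures = df[score_cols].apply(lambda col: col.map(lambda v: v is False)).sum()
missing = df[score_cols].isna().sum()

summary = pd.DataFrame(
    {
        "Successes": successes.values,
        "Failures": failures.values,
        "Missing": missing.values,
        "Total": (successes + failures + missing).values,
        "Average Success Rate": (successes / (successes + failures)).values,
        "Average Time": df[time_cols].mean().values,
        "Total Time": df[time_cols].sum().values,
    },
    index=["Trial 0"],
)
print(summary)
```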
---------
Co-authored-by: Ryan Sweet <rysweet@microsoft.com>
parent aba41d74d3
commit 483532180a
@@ -0,0 +1,17 @@
+from typing import List
+from autogen_core.model_context import UnboundedChatCompletionContext
+from autogen_core.models import AssistantMessage, LLMMessage
+
+
+class ReasoningModelContext(UnboundedChatCompletionContext):
+    """A model context for reasoning models."""
+
+    async def get_messages(self) -> List[LLMMessage]:
+        messages = await super().get_messages()
+        # Filter out thought field from AssistantMessage.
+        messages_out = []
+        for message in messages:
+            if isinstance(message, AssistantMessage):
+                message.thought = None
+            messages_out.append(message)
+        return messages_out
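The new hunk above defines the model context used for reasoning models. As a quick, hedged illustration of how it behaves (assuming it is saved as `reasoning_model_context.py` and that `autogen_core`'s `AssistantMessage` exposes the optional `thought` field used above), the sketch below shows the thought being dropped when the messages are read back:

```python
# Hedged usage sketch; the file name and message contents are assumptions.
import asyncio

from autogen_core.models import AssistantMessage, UserMessage
from reasoning_model_context import ReasoningModelContext


async def main() -> None:
    context = ReasoningModelContext()
    await context.add_message(UserMessage(content="Write a function.", source="user"))
    await context.add_message(
        AssistantMessage(content="def f(): ...", thought="<internal reasoning>", source="coder")
    )

    # get_messages() clears the thought on assistant messages before returning them.
    for message in await context.get_messages():
        print(type(message).__name__, getattr(message, "thought", None))


if __name__ == "__main__":
    asyncio.run(main())
```

Note that the context clears `thought` on the stored message objects in place, so the original thoughts are not retained once `get_messages()` has been called.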
@@ -5,9 +5,11 @@ from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
 from autogen_agentchat.teams import RoundRobinGroupChat
 from autogen_agentchat.ui import Console
 from autogen_core.models import ModelFamily
+from autogen_core.model_context import UnboundedChatCompletionContext, ChatCompletionContext
 from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
 from autogen_agentchat.conditions import TextMentionTermination
 from custom_code_executor import CustomCodeExecutorAgent
+from reasoning_model_context import ReasoningModelContext
 from autogen_core.models import ChatCompletionClient

 async def main() -> None:
@@ -17,11 +19,20 @@ async def main() -> None:
         config = yaml.safe_load(f)
     model_client = ChatCompletionClient.load_component(config["model_config"])

+    # Model context
+    model_context : ChatCompletionContext
+    if model_client.model_info["family"] == ModelFamily.R1:
+        model_context = ReasoningModelContext()
+    else:
+        model_context = UnboundedChatCompletionContext()
+
     # Coder
     coder_agent = MagenticOneCoderAgent(
         name="coder",
         model_client=model_client,
     )
+    # Set model context.
+    coder_agent._model_context = model_context  # type: ignore

     # Executor
     executor = CustomCodeExecutorAgent(
@@ -426,13 +426,17 @@ fi
 # Run the scenario
 pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
+start_time=$(date +%s)
 timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
+end_time=$(date +%s)
 EXIT_CODE=$?
 if [ $EXIT_CODE -ne 0 ]; then
     echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
 else
     echo SCENARIO.PY COMPLETE !#!#
 fi
+elapsed_time=$((end_time - start_time))
+echo "SCENARIO.PY RUNTIME: $elapsed_time !#!#"

 # Clean up
 if [ -d .cache ] ; then
@@ -543,13 +547,17 @@ fi
 # Run the scenario
 pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
+start_time=$(date +%s)
 timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
+end_time=$(date +%s)
 EXIT_CODE=$?
 if [ $EXIT_CODE -ne 0 ]; then
     echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
 else
     echo SCENARIO.PY COMPLETE !#!#
 fi
+elapsed_time=$((end_time - start_time))
+echo "SCENARIO.PY RUNTIME: $elapsed_time !#!#"

 # Clean up
 if [ -d .cache ] ; then
@@ -613,6 +621,7 @@ echo RUN.SH COMPLETE !#!#
         auto_remove=True,
         # Type hint of docker is wrong here
         volumes=volumes,  # type: ignore
+        network="host",  # Use the host network to avoid issues with localhost.
     )

     # Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
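For context, this is roughly what the host-network option looks like when calling the Docker SDK directly; a minimal sketch, not agbench's code, with a placeholder image and command:

```python
# Minimal sketch of network="host" with the Docker SDK; image and command are placeholders.
import docker

client = docker.from_env()
output = client.containers.run(
    "python:3.11-slim",
    "python -c 'print(2 + 2)'",
    network="host",  # Share the host's network namespace so localhost reaches host services.
    remove=True,
)
print(output.decode())
```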
@@ -930,9 +939,6 @@ def run_cli(args: Sequence[str]) -> None:
        if IS_WIN32:
            sys.exit("Running scenarios with --native is not supported in Windows. Exiting.")

-        if parsed_args.requirements is not None:
-            sys.exit("--requirements is not compatible with --native. Exiting.")
-
        sys.stderr.write(
            "WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
        )
@@ -1,9 +1,10 @@
 import argparse
 import os
+import re
 import sys
-from copy import deepcopy
-from typing import Any, Callable, List, Optional, Sequence, Tuple
+from typing import Any, Callable, Dict, List, Optional, Sequence

+import pandas as pd
 import tabulate as tb

 from .load_module import load_module
@@ -25,6 +26,8 @@ COMPLETED_STRINGS = [

 EXCLUDE_DIR_NAMES = ["__pycache__"]

+TIMER_REGEX = r"RUNTIME:\s*([\d.]+) !#!#"
+

 def find_tabulate_module(search_dir: str, stop_dir: Optional[str] = None) -> Optional[str]:
     """Hunt for the tabulate script."""
@@ -84,12 +87,32 @@ def default_scorer(instance_dir: str, success_strings: List[str] = SUCCESS_STRIN
         return None


+def default_timer(instance_dir: str, timer_regex: str = TIMER_REGEX) -> Optional[float]:
+    console_log = os.path.join(instance_dir, "console_log.txt")
+    if os.path.isfile(console_log):
+        with open(console_log, "rt") as fh:
+            content = fh.read()
+
+            # It succeeded
+            m = re.search(timer_regex, content)
+            if m:
+                return float(m.group(1))
+            else:
+                return None
+    else:
+        return None
+
+
 ScorerFunc = Callable[[str], Optional[bool]]
+TimerFunc = Callable[[str], Optional[float]]


 def default_tabulate(
-    args: List[str], scorer: ScorerFunc = default_scorer, exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES
-) -> Tuple[argparse.Namespace, List[List[Any]]]:
+    args: List[str],
+    scorer: ScorerFunc = default_scorer,
+    timer: TimerFunc = default_timer,
+    exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES,
+) -> None:
     invocation_cmd = args[0]
     args = args[1:]

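The per-instance running time is recovered from the console log by `default_timer` above. A small self-contained check of `TIMER_REGEX` against a made-up log (the real log is written by the updated run script shown earlier) looks like this:

```python
# Stand-alone check of the timer regex; the log text here is fabricated.
import re

TIMER_REGEX = r"RUNTIME:\s*([\d.]+) !#!#"

console_log = """SCENARIO.PY STARTING !#!#
SCENARIO.PY COMPLETE !#!#
SCENARIO.PY RUNTIME: 42 !#!#
RUN.SH COMPLETE !#!#"""

m = re.search(TIMER_REGEX, console_log)
print(float(m.group(1)) if m else None)  # -> 42.0
```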
@@ -119,7 +142,7 @@ def default_tabulate(
     parsed_args = parser.parse_args(args)
     runlogs: str = parsed_args.runlogs

-    all_results: List[List[Any]] = list()
+    all_results: List[Dict[str, Any]] = list()
     max_instances = 0

     for task_id in sorted(
@@ -135,116 +158,101 @@ def default_tabulate(
             continue

         # Collect the results vector
-        results: List[Any] = [task_id]
+        results: Dict[str, Any] = {"Task Id": task_id}

-        instance = 0
-        instance_dir = os.path.join(task_path, str(instance))
-        while os.path.isdir(instance_dir):
-            results.append(scorer(instance_dir))
-            instance += 1
+        # Collect the results for each instance.
+        instance_dirs = sorted(
+            os.listdir(task_path),
+            key=lambda s: os.path.getmtime(os.path.join(task_path, s)),
+        )
+        instances = [int(d) for d in instance_dirs if d.isdigit()]
+
+        for instance in instances:
             instance_dir = os.path.join(task_path, str(instance))
+            results[f"Trial {instance} Success"] = scorer(instance_dir)
+            results[f"Trial {instance} Time"] = timer(instance_dir)

-        max_instances = max(max_instances, instance)
+        max_instances = max(instances)

         # Buffer the results
         all_results.append(results)

+    num_instances = max_instances + 1
+
+    # Pad the results to max_instances
+    for result in all_results:
+        for i in range(num_instances):
+            if f"Trial {i} Success" not in result:
+                result[f"Trial {i} Success"] = None
+            if f"Trial {i} Time" not in result:
+                result[f"Trial {i} Time"] = None
+
+    # Create dataframe from results.
+    df = pd.DataFrame(all_results)
+
     if parsed_args.csv:
-        # Create a header
-        header = ["Task Id"]
-        for i in range(0, max_instances):
-            header.append("Trial " + str(i) + " Success")
-
-        print(",".join(header))
-        for row in all_results:
-            str_row = [f"{v}" if v is not None else "" for v in row]
-            while len(str_row) < max_instances + 1:
-                str_row.append("")
-            print(",".join(str_row))
+        # Print out the dataframe in CSV format
+        print(df.to_csv(index=False))

         # Print out alpha-version warning
         sys.stderr.write("\n" + warning + "\n\n")
     else:
-        # Create a header
-        header = ["\nTask Id"]
-        for i in range(0, max_instances):
-            header.append("Trial " + str(i) + "\nSuccess")
+        # Tabulate the results.
+        print(tb.tabulate(df, headers="keys", tablefmt="simple"))  # type: ignore

-        # Create the footer
-        def _count_equals(value: Optional[bool], trial: int) -> int:
-            count = 0
-            for row in all_results:
-                is_answer_matched = row[trial + 1][0] if isinstance(row[trial + 1], tuple) else row[trial + 1]
+        # Aggregate statistics for all tasks for each trials.
+        print("\nSummary Statistics\n")
+        score_columns = ["Trial " + str(i) + " Success" for i in range(num_instances)]
+        # Count the number of successes when the value is True.
+        successes = df[score_columns].apply(lambda x: x is True).sum(axis=0)  # type: ignore
+        # Count the number of failures when the value is False.
+        failures: pd.Series = df[score_columns].apply(lambda x: x is False).sum(axis=0)  # type: ignore
+        # Count the number of missing
+        missings = df[score_columns].isna().sum(axis=0)  # type: ignore
+        # Count the total number of instances
+        totals = successes + failures + missings  # type: ignore
+        # Calculate the average success rates
+        avg_success_rates = successes / (successes + failures)  # type: ignore
+        time_columns = ["Trial " + str(i) + " Time" for i in range(num_instances)]  # type: ignore
+        # Count the total time of non-null values
+        total_times = df[time_columns].sum(axis=0, skipna=True)  # type: ignore
+        # Calculate the average time of non-null values
+        avg_times = df[time_columns].mean(axis=0, skipna=True)  # type: ignore

-                # Count missing
-                if value is None:
-                    if trial + 1 < len(row):
-                        if is_answer_matched is None:
-                            count += 1
-                    else:
-                        count += 1
-                # Count match
-                elif trial + 1 < len(row) and is_answer_matched == value:
-                    count += 1
-            return count
+        # Create a per-trial summary dataframe
+        trial_df = pd.DataFrame(
+            {
+                "Successes": list(successes),  # type: ignore
+                "Failures": list(failures),  # type: ignore
+                "Missing": list(missings),  # type: ignore
+                "Total": list(totals),  # type: ignore
+                "Average Success Rate": list(avg_success_rates),  # type: ignore
+                "Average Time": list(avg_times),  # type: ignore
+                "Total Time": list(total_times),  # type: ignore
+            },
+            index=[f"Trial {i}" for i in range(num_instances)],
+        )
+        # Print out the per-trial summary dataframe.
+        print(tb.tabulate(trial_df, headers="keys", tablefmt="simple"))  # type: ignore

-        footer: List[Any] = []
-        footer_row: List[Any] = ["Successes"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(True, i))
-        footer.append(footer_row)
+        # Aggregate statistics across tasks for all trials.
+        # At least one success for each trial, averaged across tasks.
+        average_at_least_one_success = df[score_columns].any(axis=1).mean(skipna=True)  # type: ignore
+        # All successes for each trial
+        average_all_successes = df[score_columns].all(axis=1).mean(skipna=True)  # type: ignore

-        footer_row = ["Failures"]
-        for i in range(0, max_instances):
-            # count how many are not True, and not None, could be False or any other value
-            failures = 0
-            for row in all_results:
-                if isinstance(row[i + 1], tuple):
-                    failures += row[i + 1][0] not in [1, None]
-                else:
-                    failures += row[i + 1] not in [1, None]
-            footer_row.append(failures)
-        footer.append(footer_row)
-
-        footer_row = ["Missing"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(None, i))
-        footer.append(footer_row)
-
-        footer_row = ["Total"]
-        for i in range(0, max_instances):
-            footer_row.append(footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1])
-        footer.append(footer_row)
-
-        footer_row = ["Average Success Rate"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(True, i) / (footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1]))
-        footer.append(footer_row)
-
-        footer_row = ["Average Score"]
-        for i in range(0, max_instances):
-            avg_score_trial = 0.0
-            for row in all_results:
-                if isinstance(row[i + 1], tuple):
-                    avg_score_trial += row[i + 1][0]
-            avg_score_trial = avg_score_trial / len(all_results)
-            footer_row.append(avg_score_trial)
-        footer.append(footer_row)
-
-        table = deepcopy(all_results)
-        for row in table:
-            for trial in range(0, max_instances):
-                if isinstance(row[trial + 1], tuple):
-                    row[trial + 1] = row[trial + 1][0]
-
-        table.append(tb.SEPARATING_LINE)  # type: ignore
-        table.extend(footer)
-
-        print(tb.tabulate(table, headers=header))
+        # Create a dataframe
+        trial_aggregated_df = pd.DataFrame(
+            {
+                "At Least One Success": [average_at_least_one_success],  # type: ignore
+                "All Successes": [average_all_successes],  # type: ignore
+            },
+            index=["Trial Aggregated"],
+        )
+        # Print out the trial-aggregated dataframe.
+        print(tb.tabulate(trial_aggregated_df, headers="keys", tablefmt="simple"))  # type: ignore

         # Print out alpha-version warning
         sys.stderr.write("\n" + warning + "\n\n")
-    return parsed_args, all_results


 def tabulate_cli(args: Sequence[str]) -> None: