Improvements to agbench (#5776)

1. Add host network support for Docker runs and remove the unused `--requirements` check from argument validation.
2. Use Pandas to simplify the summary statistic calculations.
3. Add running time to the summary statistics.

```
Using tabulation method defined in '/home/ekzhu/autogen/python/packages/agbench/benchmarks/HumanEval/Scripts/custom_tabulate.py'

    Task Id       Trial 0 Success      Trial 0 Time
--  ------------  -----------------  --------------
 0  HumanEval_0   True                            3
 1  HumanEval_1   False                          15
 2  HumanEval_2   True                            2
 3  HumanEval_3   True                           11
 4  HumanEval_4   True                            4
 5  HumanEval_5   True                            2
 6  HumanEval_6   False                          18
 7  HumanEval_7   True                            2
 8  HumanEval_8   True                            2
 9  HumanEval_9   True                           12
10  HumanEval_10  False                          11
11  HumanEval_11  True                            2
12  HumanEval_12  True                            3
13  HumanEval_13  True                            1
14  HumanEval_14  True                            4
15  HumanEval_15  True                            1
16  HumanEval_16  True                            2
17  HumanEval_17  False                          76
18  HumanEval_18  True                            4
19  HumanEval_19  True                            3
20  HumanEval_20  True                            5
21  HumanEval_21  True                            3
22  HumanEval_22  True                            1
23  HumanEval_23  True                            2
24  HumanEval_24                                nan

Summary Statistics

           Successes    Failures    Missing    Total    Average Success Rate    Average Time    Total Time
-------  -----------  ----------  ---------  -------  ----------------------  --------------  ------------
Trial 0           20           4          1       25                     0.8           7.875           189

CAUTION: 'autogenbench tabulate' is in early preview and is not thoroughly tested.
Please do not cite values from these calculations in academic work without first inspecting and verifying the results in the run logs yourself.

```

The default `tabulate` output now looks like the example above (here invoked through the HumanEval benchmark's `custom_tabulate.py`).
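
For reference, the per-trial summary is an ordinary Pandas aggregation. A standalone sketch of the same idea (toy data, column names mirroring the table above, not the exact agbench code):

```
import pandas as pd

# Toy per-task results in the shape agbench builds: one row per task and a
# Success/Time column pair per trial. The values are made up.
df = pd.DataFrame(
    {
        "Task Id": ["HumanEval_0", "HumanEval_1", "HumanEval_2"],
        "Trial 0 Success": [True, False, None],
        "Trial 0 Time": [3.0, 15.0, None],
    }
)

score_cols = ["Trial 0 Success"]
time_cols = ["Trial 0 Time"]

successes = df[score_cols].apply(lambda col: col.map(lambda v: v is True)).sum()
failures = df[score_cols].apply(lambda col: col.map(lambda v: v is False)).sum()
missing = df[score_cols].isna().sum()

summary = pd.DataFrame(
    {
        "Successes": successes.values,
        "Failures": failures.values,
        "Missing": missing.values,
        "Total": (successes + failures + missing).values,
        "Average Success Rate": (successes / (successes + failures)).values,
        "Average Time": df[time_cols].mean().values,
        "Total Time": df[time_cols].sum().values,
    },
    index=["Trial 0"],
)
print(summary)
```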

---------

Co-authored-by: Ryan Sweet <rysweet@microsoft.com>
Eric Zhu · 2025-03-16 · commit 483532180a (parent aba41d74d3)
4 changed files with 142 additions and 100 deletions

New file `reasoning_model_context.py` (module name taken from the import in the scenario change below): a model context for reasoning models that clears the `thought` field on assistant messages returned by `get_messages()`:

```
from typing import List

from autogen_core.model_context import UnboundedChatCompletionContext
from autogen_core.models import AssistantMessage, LLMMessage


class ReasoningModelContext(UnboundedChatCompletionContext):
    """A model context for reasoning models."""

    async def get_messages(self) -> List[LLMMessage]:
        messages = await super().get_messages()
        # Filter out thought field from AssistantMessage.
        messages_out = []
        for message in messages:
            if isinstance(message, AssistantMessage):
                message.thought = None
            messages_out.append(message)
        return messages_out
```
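
A quick usage sketch of the new context (assuming `reasoning_model_context.py` is importable from the working directory; the messages are made up):

```
import asyncio

from autogen_core.models import AssistantMessage, UserMessage
from reasoning_model_context import ReasoningModelContext


async def demo() -> None:
    context = ReasoningModelContext()
    await context.add_message(UserMessage(content="Write a sorting function.", source="user"))
    await context.add_message(
        AssistantMessage(content="def sort(xs): ...", thought="Let me reason step by step...", source="assistant")
    )
    # The assistant message comes back with its thought cleared, so reasoning
    # traces are not replayed to the model on later turns.
    for message in await context.get_messages():
        print(type(message).__name__, getattr(message, "thought", None))


asyncio.run(demo())
```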

Changes to the MagenticOne coder scenario template: select `ReasoningModelContext` for R1-family models and keep the unbounded context otherwise:

```
@@ -5,9 +5,11 @@ from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
 from autogen_agentchat.teams import RoundRobinGroupChat
 from autogen_agentchat.ui import Console
 from autogen_core.models import ModelFamily
+from autogen_core.model_context import UnboundedChatCompletionContext, ChatCompletionContext
 from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
 from autogen_agentchat.conditions import TextMentionTermination
 from custom_code_executor import CustomCodeExecutorAgent
+from reasoning_model_context import ReasoningModelContext
 from autogen_core.models import ChatCompletionClient


 async def main() -> None:

@@ -17,11 +19,20 @@ async def main() -> None:
         config = yaml.safe_load(f)

     model_client = ChatCompletionClient.load_component(config["model_config"])

+    # Model context
+    model_context : ChatCompletionContext
+    if model_client.model_info["family"] == ModelFamily.R1:
+        model_context = ReasoningModelContext()
+    else:
+        model_context = UnboundedChatCompletionContext()
+
     # Coder
     coder_agent = MagenticOneCoderAgent(
         name="coder",
         model_client=model_client,
     )
+    # Set model context.
+    coder_agent._model_context = model_context  # type: ignore

     # Executor
     executor = CustomCodeExecutorAgent(
```

Changes to the run command: the generated run script now times `scenario.py` and echoes the runtime, Docker containers join the host network, and the unused `--requirements` check for `--native` runs is removed:

```
@@ -426,13 +426,17 @@ fi
 # Run the scenario
 pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
+start_time=$(date +%s)
 timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
+end_time=$(date +%s)
 EXIT_CODE=$?
 if [ $EXIT_CODE -ne 0 ]; then
     echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
 else
     echo SCENARIO.PY COMPLETE !#!#
 fi
+elapsed_time=$((end_time - start_time))
+echo "SCENARIO.PY RUNTIME: $elapsed_time !#!#"

 # Clean up
 if [ -d .cache ] ; then

@@ -543,13 +547,17 @@ fi
 # Run the scenario
 pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
+start_time=$(date +%s)
 timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
+end_time=$(date +%s)
 EXIT_CODE=$?
 if [ $EXIT_CODE -ne 0 ]; then
     echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
 else
     echo SCENARIO.PY COMPLETE !#!#
 fi
+elapsed_time=$((end_time - start_time))
+echo "SCENARIO.PY RUNTIME: $elapsed_time !#!#"

 # Clean up
 if [ -d .cache ] ; then

@@ -613,6 +621,7 @@ echo RUN.SH COMPLETE !#!#
         auto_remove=True,
         # Type hint of docker is wrong here
         volumes=volumes,  # type: ignore
+        network="host",  # Use the host network to avoid issues with localhost.
     )

     # Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.

@@ -930,9 +939,6 @@ def run_cli(args: Sequence[str]) -> None:
     if IS_WIN32:
         sys.exit("Running scenarios with --native is not supported in Windows. Exiting.")

-    if parsed_args.requirements is not None:
-        sys.exit("--requirements is not compatible with --native. Exiting.")
-
     sys.stderr.write(
         "WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
     )
```
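
The surrounding agbench code drives containers through the Docker SDK for Python; in isolation the host-network option looks like this (a sketch with a placeholder image and command, not what agbench actually runs):

```
import docker

client = docker.from_env()
container = client.containers.run(
    "python:3.11-slim",  # placeholder image
    command=["python", "-c", "print('hello from the host network')"],
    detach=True,
    auto_remove=True,
    network="host",  # share the host's network stack so localhost services are reachable
)
for chunk in container.logs(stream=True, follow=True):
    print(chunk.decode("utf-8"), end="")
```

With `network="host"` the container shares the host's network namespace (on Linux), so a model server listening on the host's `localhost` is reachable without extra port mapping.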

Changes to the tabulate command: per-task results are collected into a Pandas DataFrame, per-trial runtimes are parsed from `console_log.txt`, and the summary statistics are computed with Pandas instead of hand-built footer rows:

```
@@ -1,9 +1,10 @@
 import argparse
 import os
+import re
 import sys
-from copy import deepcopy
-from typing import Any, Callable, List, Optional, Sequence, Tuple
+from typing import Any, Callable, Dict, List, Optional, Sequence

+import pandas as pd
 import tabulate as tb

 from .load_module import load_module

@@ -25,6 +26,8 @@ COMPLETED_STRINGS = [
 EXCLUDE_DIR_NAMES = ["__pycache__"]

+TIMER_REGEX = r"RUNTIME:\s*([\d.]+) !#!#"
+

 def find_tabulate_module(search_dir: str, stop_dir: Optional[str] = None) -> Optional[str]:
     """Hunt for the tabulate script."""

@@ -84,12 +87,32 @@ def default_scorer(instance_dir: str, success_strings: List[str] = SUCCESS_STRINGS
         return None


+def default_timer(instance_dir: str, timer_regex: str = TIMER_REGEX) -> Optional[float]:
+    console_log = os.path.join(instance_dir, "console_log.txt")
+    if os.path.isfile(console_log):
+        with open(console_log, "rt") as fh:
+            content = fh.read()
+            # It succeeded
+            m = re.search(timer_regex, content)
+            if m:
+                return float(m.group(1))
+            else:
+                return None
+    else:
+        return None
+
+
 ScorerFunc = Callable[[str], Optional[bool]]
+TimerFunc = Callable[[str], Optional[float]]


 def default_tabulate(
-    args: List[str], scorer: ScorerFunc = default_scorer, exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES
-) -> Tuple[argparse.Namespace, List[List[Any]]]:
+    args: List[str],
+    scorer: ScorerFunc = default_scorer,
+    timer: TimerFunc = default_timer,
+    exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES,
+) -> None:
     invocation_cmd = args[0]
     args = args[1:]

@@ -119,7 +142,7 @@ def default_tabulate(
     parsed_args = parser.parse_args(args)
     runlogs: str = parsed_args.runlogs

-    all_results: List[List[Any]] = list()
+    all_results: List[Dict[str, Any]] = list()
     max_instances = 0

     for task_id in sorted(

@@ -135,116 +158,101 @@ def default_tabulate(
             continue

         # Collect the results vector
-        results: List[Any] = [task_id]
-
-        instance = 0
-        instance_dir = os.path.join(task_path, str(instance))
-        while os.path.isdir(instance_dir):
-            results.append(scorer(instance_dir))
-            instance += 1
-            instance_dir = os.path.join(task_path, str(instance))
-
-        max_instances = max(max_instances, instance)
+        results: Dict[str, Any] = {"Task Id": task_id}
+        # Collect the results for each instance.
+        instance_dirs = sorted(
+            os.listdir(task_path),
+            key=lambda s: os.path.getmtime(os.path.join(task_path, s)),
+        )
+        instances = [int(d) for d in instance_dirs if d.isdigit()]
+        for instance in instances:
+            instance_dir = os.path.join(task_path, str(instance))
+            results[f"Trial {instance} Success"] = scorer(instance_dir)
+            results[f"Trial {instance} Time"] = timer(instance_dir)
+        max_instances = max(instances)

         # Buffer the results
         all_results.append(results)

+    num_instances = max_instances + 1
+
+    # Pad the results to max_instances
+    for result in all_results:
+        for i in range(num_instances):
+            if f"Trial {i} Success" not in result:
+                result[f"Trial {i} Success"] = None
+            if f"Trial {i} Time" not in result:
+                result[f"Trial {i} Time"] = None
+
+    # Create dataframe from results.
+    df = pd.DataFrame(all_results)
+
     if parsed_args.csv:
-        # Create a header
-        header = ["Task Id"]
-        for i in range(0, max_instances):
-            header.append("Trial " + str(i) + " Success")
-        print(",".join(header))
-        for row in all_results:
-            str_row = [f"{v}" if v is not None else "" for v in row]
-            while len(str_row) < max_instances + 1:
-                str_row.append("")
-            print(",".join(str_row))
+        # Print out the dataframe in CSV format
+        print(df.to_csv(index=False))

         # Print out alpha-version warning
         sys.stderr.write("\n" + warning + "\n\n")
     else:
-        # Create a header
-        header = ["\nTask Id"]
-        for i in range(0, max_instances):
-            header.append("Trial " + str(i) + "\nSuccess")
-
-        # Create the footer
-        def _count_equals(value: Optional[bool], trial: int) -> int:
-            count = 0
-            for row in all_results:
-                is_answer_matched = row[trial + 1][0] if isinstance(row[trial + 1], tuple) else row[trial + 1]
-
-                # Count missing
-                if value is None:
-                    if trial + 1 < len(row):
-                        if is_answer_matched is None:
-                            count += 1
-                    else:
-                        count += 1
-                # Count match
-                elif trial + 1 < len(row) and is_answer_matched == value:
-                    count += 1
-            return count
-
-        footer: List[Any] = []
-
-        footer_row: List[Any] = ["Successes"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(True, i))
-        footer.append(footer_row)
-
-        footer_row = ["Failures"]
-        for i in range(0, max_instances):
-            # count how many are not True, and not None, could be False or any other value
-            failures = 0
-            for row in all_results:
-                if isinstance(row[i + 1], tuple):
-                    failures += row[i + 1][0] not in [1, None]
-                else:
-                    failures += row[i + 1] not in [1, None]
-            footer_row.append(failures)
-        footer.append(footer_row)
-
-        footer_row = ["Missing"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(None, i))
-        footer.append(footer_row)
-
-        footer_row = ["Total"]
-        for i in range(0, max_instances):
-            footer_row.append(footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1])
-        footer.append(footer_row)
-
-        footer_row = ["Average Success Rate"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(True, i) / (footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1]))
-        footer.append(footer_row)
-
-        footer_row = ["Average Score"]
-        for i in range(0, max_instances):
-            avg_score_trial = 0.0
-            for row in all_results:
-                if isinstance(row[i + 1], tuple):
-                    avg_score_trial += row[i + 1][0]
-            avg_score_trial = avg_score_trial / len(all_results)
-            footer_row.append(avg_score_trial)
-        footer.append(footer_row)
-
-        table = deepcopy(all_results)
-        for row in table:
-            for trial in range(0, max_instances):
-                if isinstance(row[trial + 1], tuple):
-                    row[trial + 1] = row[trial + 1][0]
-
-        table.append(tb.SEPARATING_LINE)  # type: ignore
-        table.extend(footer)
-
-        print(tb.tabulate(table, headers=header))
+        # Tabulate the results.
+        print(tb.tabulate(df, headers="keys", tablefmt="simple"))  # type: ignore
+
+        # Aggregate statistics for all tasks for each trials.
+        print("\nSummary Statistics\n")
+        score_columns = ["Trial " + str(i) + " Success" for i in range(num_instances)]
+        # Count the number of successes when the value is True.
+        successes = df[score_columns].apply(lambda x: x is True).sum(axis=0)  # type: ignore
+        # Count the number of failures when the value is False.
+        failures: pd.Series = df[score_columns].apply(lambda x: x is False).sum(axis=0)  # type: ignore
+        # Count the number of missing
+        missings = df[score_columns].isna().sum(axis=0)  # type: ignore
+        # Count the total number of instances
+        totals = successes + failures + missings  # type: ignore
+        # Calculate the average success rates
+        avg_success_rates = successes / (successes + failures)  # type: ignore
+        time_columns = ["Trial " + str(i) + " Time" for i in range(num_instances)]  # type: ignore
+        # Count the total time of non-null values
+        total_times = df[time_columns].sum(axis=0, skipna=True)  # type: ignore
+        # Calculate the average time of non-null values
+        avg_times = df[time_columns].mean(axis=0, skipna=True)  # type: ignore
+
+        # Create a per-trial summary dataframe
+        trial_df = pd.DataFrame(
+            {
+                "Successes": list(successes),  # type: ignore
+                "Failures": list(failures),  # type: ignore
+                "Missing": list(missings),  # type: ignore
+                "Total": list(totals),  # type: ignore
+                "Average Success Rate": list(avg_success_rates),  # type: ignore
+                "Average Time": list(avg_times),  # type: ignore
+                "Total Time": list(total_times),  # type: ignore
+            },
+            index=[f"Trial {i}" for i in range(num_instances)],
+        )
+        # Print out the per-trial summary dataframe.
+        print(tb.tabulate(trial_df, headers="keys", tablefmt="simple"))  # type: ignore
+
+        # Aggregate statistics across tasks for all trials.
+        # At least one success for each trial, averaged across tasks.
+        average_at_least_one_success = df[score_columns].any(axis=1).mean(skipna=True)  # type: ignore
+        # All successes for each trial
+        average_all_successes = df[score_columns].all(axis=1).mean(skipna=True)  # type: ignore
+        # Create a dataframe
+        trial_aggregated_df = pd.DataFrame(
+            {
+                "At Least One Success": [average_at_least_one_success],  # type: ignore
+                "All Successes": [average_all_successes],  # type: ignore
+            },
+            index=["Trial Aggregated"],
+        )
+        # Print out the trial-aggregated dataframe.
+        print(tb.tabulate(trial_aggregated_df, headers="keys", tablefmt="simple"))  # type: ignore

         # Print out alpha-version warning
         sys.stderr.write("\n" + warning + "\n\n")

-    return parsed_args, all_results


 def tabulate_cli(args: Sequence[str]) -> None:
```
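
The timing round trip is easy to check in isolation: `run.sh` echoes `SCENARIO.PY RUNTIME: <seconds> !#!#` into the console log, and `default_timer` extracts the value with `TIMER_REGEX`. A minimal sketch (the log line is fabricated):

```
import re

TIMER_REGEX = r"RUNTIME:\s*([\d.]+) !#!#"

# A line like the one run.sh appends to console_log.txt after the scenario finishes.
log_line = "SCENARIO.PY RUNTIME: 57 !#!#"

match = re.search(TIMER_REGEX, log_line)
print(float(match.group(1)) if match else None)  # 57.0
```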