Mirror of https://github.com/microsoft/autogen.git (synced 2025-08-08 00:32:03 +00:00)

* Initial commit of the autogen testbed environment.
* Fixed some typos in the Testbed README.md
* Added some stricter termination logic to the two_agent scenario, and switched the logo task from finding Autogen's logo to finding Microsoft's (it's easier)
* Added documentation to testbed code in preparation for PR
* Added a variation of HumanEval to the Testbed. It is also a reasonable example of how to integrate other benchmarks.
* Removed ChatCompletion.start_logging and related features. Added an explicit TERMINATE output to HumanEval to save 1 turn in each conversation.
* Added metrics utils script for HumanEval
* Updated the requirements in the README.
* Added documentation for HumanEval csv schemas
* Standardized on how the OAI_CONFIG_LIST is handled.
* Removed dot-slash from 'includes' path for cross-platform compatibility
* Missed a file.
* Updated readme to include known-working versions.
117 lines · 3.5 KiB · Python
import os
import sys
import argparse
import csv


def metrics(results_fh):
    """
    Compute metrics from collated HumanEval results.

    Args:
        results_fh (File Stream): A file stream containing the collated results in CSV.
    """
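    # Note (inferred from the parsing below, not stated in the original
    # docstring): the input is expected to have one row per HumanEval problem,
    # with a TestId column followed by one column per trial giving the
    # conversation turn at which the problem first passed (negative if it
    # never passed, blank if the data is missing). An illustrative example:
    #
    #   TestId,Trial0,Trial1
    #   HumanEval_0,1,2
    #   HumanEval_1,-1,
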
    reader = csv.reader(results_fh)
    first_row = next(reader)  # Read the first line

    num_trials = len(first_row) - 1  # Don't count the first column (TestId)
    max_turns = 0
    num_rows = 0

    # Load the results. We'll need to iterate over them a few times.
    results = list()
    for row in reader:
        num_rows += 1

        name = row[0]
        trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
        for v in trials:
            if v is not None:
                max_turns = max(max_turns, v)
        results.append([name, trials])
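    # At this point, results is a list of [TestId, trials] pairs, where
    # trials[t] holds the value recorded for trial t: the passing turn,
    # a negative failure marker, or None for missing data.
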
    # Print the header
    header = ["Trial"]
    for i in range(1, max_turns + 1):
        header.append("cumulative_passes_by_turn_" + str(i))
    header.append("fails")
    header.append("missing")
    print(",".join(header))

    # Compute the metrics
    def _metrics_for_trial(t):
        counts = [None]
        fails = 0
        missing = 0

        # Compute cumulative passes for each conversation turn
        for i in range(1, max_turns + 1):
            counts.append(0)
            assert len(counts) == i + 1

            for r in results:
                v = r[1][t]
                if v is not None:
                    v = int(v)
                    if 0 <= v <= i:
                        counts[i] += 1

        # Count missing and failed
        for r in results:
            v = r[1][t]
            if v is None:
                missing += 1
            elif int(v) < 0:
                fails += 1

        # Prepare the row in the format specified by the header
        return str(t) + "," + ",".join([str(v) for v in counts[1:]]) + "," + str(fails) + "," + str(missing)

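    # Each printed row follows the header above. As a purely illustrative
    # example, with max_turns == 2 the row "0,5,9,2,1" would mean: in trial 0,
    # five problems passed by turn 1, nine passed by turn 2 (cumulative), two
    # never passed, and one had missing data.
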
    # Print each row
    for t in range(0, num_trials):
        print(_metrics_for_trial(t))


###############################################################################
if __name__ == "__main__":
    script_path = os.path.realpath(__file__)
    script_name = os.path.basename(script_path)
    script_dir = os.path.dirname(script_path)

    parser = argparse.ArgumentParser(
        description=f"""
{script_name} will compute metrics on the collated results of the HumanEval scenarios. Use collate_human_eval.py to prepare input to this script.

The output will be formatted as a CSV with the following schema:

Trial, cumulative_passes_by_turn_1, ..., cumulative_passes_by_turn_N, fails, missing
0      x_01, ..., x_0N, y_0, z_0
1      x_11, ..., x_1N, y_1, z_1
...
M      x_M1, ..., x_MN, y_M, z_M

Where:

  x_ij is the number of HumanEval problems in Trial i that achieved a passing result by conversation turn j.
  y_i is the number of HumanEval problems in Trial i that never achieved a passing result (they failed).
  z_i is the number of HumanEval problems in Trial i that have missing data.
""".strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "scenario",
        nargs="?",
        help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
        default="-",
    )
    args = parser.parse_args()

    if args.scenario == "" or args.scenario == "-":
        metrics(sys.stdin)
    else:
        with open(args.scenario, "rt") as fh:
            metrics(fh)
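
# Example usage (a sketch only; it assumes this file is saved as
# metrics_human_eval.py, and that collate_human_eval.py, the companion script
# referenced in the help text above, writes its collated CSV to stdout):
#
#   python collate_human_eval.py ./results > collated.csv
#   python metrics_human_eval.py collated.csv
#
# or, piping straight through stdin:
#
#   python collate_human_eval.py ./results | python metrics_human_eval.py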