mirror of
https://github.com/microsoft/autogen.git
synced 2025-08-11 18:21:31 +00:00
117 lines
3.5 KiB
Python
117 lines
3.5 KiB
Python
![]() |
import os
|
||
|
import sys
|
||
|
import argparse
|
||
|
import csv
|
||
|
|
||
|
|
||
|
def metrics(results_fh):
    """
    Compute metrics from collated HumanEval results and print them as CSV.

    The input is read with ``csv.reader``. The first row is treated as a
    header whose first column is the test id; every remaining header column
    stands for one trial. In each following row the first cell is the test
    name and each trial cell is one of:

    * empty (or whitespace only) -- the result is missing,
    * a negative integer -- the problem failed,
    * a non-negative integer -- the conversation turn by which it passed.

    One header line is printed to stdout, followed by one line per trial
    holding the cumulative pass count for each turn, the number of fails and
    the number of missing results.

    Args:
        results_fh (File Stream): A file stream containing the collated results in CSV.

    Raises:
        ValueError: If a non-empty trial cell is not an integer.
    """
    reader = csv.reader(results_fh)

    # An empty stream has no header row; treat it as "zero trials" rather
    # than letting StopIteration escape from next().
    first_row = next(reader, None)
    num_trials = len(first_row) - 1 if first_row else 0  # Don't count the first column (TestId)

    # Load the results. They are scanned once per trial further down.
    results = []
    max_turns = 0
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue

        trials = [None if cell.strip() == "" else int(cell) for cell in row[1:]]
        # A row shorter than the header simply has no data for the trailing
        # trials, so record those cells as missing.
        trials.extend([None] * (num_trials - len(trials)))

        max_turns = max(max_turns, max((v for v in trials if v is not None), default=0))
        results.append((row[0], trials))

    turns = range(1, max_turns + 1)

    # Print the header
    header = ["Trial"]
    header.extend("cumulative_passes_by_turn_" + str(i) for i in turns)
    header.extend(["fails", "missing"])
    print(",".join(header))

    # Print one row per trial, in the format specified by the header
    for t in range(num_trials):
        column = [trials[t] for _, trials in results]

        # Cumulative passes: a problem counts for turn i when it passed on
        # any turn up to and including i.
        passes = [sum(1 for v in column if v is not None and 0 <= v <= i) for i in turns]
        fails = sum(1 for v in column if v is not None and v < 0)
        missing = sum(1 for v in column if v is None)

        # Join all fields in one go so that an empty `passes` list does not
        # leave a stray empty column between the trial id and `fails`.
        print(",".join(str(field) for field in [t, *passes, fails, missing]))
|
||
|
|
||
|
|
||
|
###############################################################################
|
||
|
if __name__ == "__main__":
    # The script's own file name is interpolated into the help text below.
    script_name = os.path.basename(os.path.realpath(__file__))

    parser = argparse.ArgumentParser(
        description=f"""
{script_name} will compute metrics on the collated results of the HumanEval scenarios. Use collate_human_eval.py to prepare input to this script.

The output will be formatted as a CSV with the following schema:

Trial, cumulative_passes_by_turn_1, ..., cumulative_passes_by_turn_N, fails, missing
0 x_01, x_0N, y_0, z_0
1 x_11, x_1N, y_1, z_1
...
M x_M1, x_MN, y_M, z_M

Where:

x_ij is the number of HumanEval problems in Trial i that achieved a passing result by conversation turn j.
y_i is the number of HumanEval problems in Trial i that never achieved a passing result (they failed).
z_i is the number of HumanEval problems in Trial i that have missing data.

""".strip(),
        # Keep the line breaks of the schema description as written.
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "scenario",
        nargs="?",
        help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
        default="-",
    )
    args = parser.parse_args()

    if args.scenario in ("", "-"):
        metrics(sys.stdin)
    else:
        # The csv module asks for files to be opened with newline="" so that
        # it can handle line endings (including ones embedded in quoted
        # fields) itself.
        with open(args.scenario, "rt", newline="") as fh:
            metrics(fh)
|