autogen/samples/tools/testbed/utils/metrics_human_eval.py
afourney 1c4a5e6a1a
Added a simple Testbed tool for repeatedly running templated Autogen scenarios with tightly-controlled initial conditions. (#455)
* Initial commit of the autogen testbed environment.

* Fixed some typos in the Testbed README.md

* Added some stricter termination logic to the two_agent scenario, and switched the logo task from finding Autogen's logo to finding Microsoft's (it's easier)

* Added documentation to testbed code in preparation for PR

* Added a variation of HumanEval to the Testbed. It is also a reasonable example of how to integrate other benchmarks.

* Removed ChatCompletion.start_logging and related features. Added an explicit TERMINATE output to HumanEval to save 1 turn in each conversation.

* Added metrics utils script for HumanEval

* Updated the requirements in the README.

* Added documentation for HumanEval csv schemas

* Standardized on how the OAI_CONFIG_LIST is handled.

* Removed dot-slash from 'includes' path for cross-platform compatibility

* Missed a file.

* Updated readme to include known-working versions.
2023-11-04 10:38:43 +00:00


import os
import sys
import argparse
import csv


def metrics(results_fh):
    """
    Compute metrics from collated HumanEval results.

    Args:
        results_fh (File Stream): A file stream containing the collated results in CSV.
    """
    reader = csv.reader(results_fh)
    first_row = next(reader)  # Read the first line

    num_trials = len(first_row) - 1  # Don't count the first column (TestId)
    max_turns = 0
    num_rows = 0

    # Load the results. We'll need to iterate over them a few times.
    results = list()
    for row in reader:
        num_rows += 1
        name = row[0]
        trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
        for v in trials:
            if v is not None:
                max_turns = max(max_turns, v)
        results.append([name, trials])
    # Print the header
    header = ["Trial"]
    for i in range(1, max_turns + 1):
        header.append("cumulative_passes_by_turn_" + str(i))
    header.append("fails")
    header.append("missing")
    print(",".join(header))

    # Compute the metrics
    def _metrics_for_trial(t):
        counts = [None]
        fails = 0
        missing = 0

        # Compute cumulative passes for each conversation turn
        for i in range(1, max_turns + 1):
            counts.append(0)
            assert len(counts) == i + 1
            for r in results:
                v = r[1][t]
                if v is not None:
                    v = int(v)
                    if 0 <= v and v <= i:
                        counts[i] += 1

        # Count missing and failed
        for r in results:
            v = r[1][t]
            if v is None:
                missing += 1
            elif int(v) < 0:
                fails += 1

        # Prepare the row in the format specified by the header
        return str(t) + "," + ",".join([str(v) for v in counts[1:]]) + "," + str(fails) + "," + str(missing)

    # Print each row
    for t in range(0, num_trials):
        print(_metrics_for_trial(t))

###############################################################################
if __name__ == "__main__":
    script_path = os.path.realpath(__file__)
    script_name = os.path.basename(script_path)
    script_dir = os.path.dirname(script_path)

    parser = argparse.ArgumentParser(
        description=f"""
{script_name} will compute metrics on the collated results of the HumanEval scenarios. Use collate_human_eval.py to prepare input to this script.

The output will be formatted as a CSV with the following schema:

Trial,  cumulative_passes_by_turn_1, ..., cumulative_passes_by_turn_N, fails, missing
0       x_01, ...                         x_0N,                        y_0,   z_0
1       x_11, ...                         x_1N,                        y_1,   z_1
...
M       x_M1, ...                         x_MN,                        y_M,   z_M

Where:

  x_ij is the number of HumanEval problems in Trial i that achieved a passing result by conversation turn j.
  y_i  is the number of HumanEval problems in Trial i that never achieved a passing result (they failed).
  z_i  is the number of HumanEval problems in Trial i that have missing data.
""".strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "scenario",
        nargs="?",
        help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
        default="-",
    )
    args = parser.parse_args()

    if args.scenario == "" or args.scenario == "-":
        metrics(sys.stdin)
    else:
        with open(args.scenario, "rt") as fh:
            metrics(fh)
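
# Example usage (an illustrative sketch; the file name is a placeholder, and the exact
# collate_human_eval.py invocation is not shown because its arguments are not
# documented here):
#
#   python metrics_human_eval.py collated_results.csv
#   python metrics_human_eval.py < collated_results.csv   # equivalent: read from stdin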