autogen/samples/tools/testbed/utils/metrics_human_eval.py
afourney 1c4a5e6a1a
Added a simple Testbed tool for repeatedly running templated Autogen scenarios with tightly-controlled initial conditions. (#455)
* Initial commit of the autogen testbed environment.

* Fixed some typos in the Testbed README.md

* Added some stricter termination logic to the two_agent scenario, and switched the logo task from finding Autogen's logo to finding Microsoft's (it's easier)

* Added documentation to testbed code in preparation for PR

* Added a variation of HumanEval to the Testbed. It is also a reasonable example of how to integrate other benchmarks.

* Removed ChatCompletion.start_logging and related features. Added an explicit TERMINATE output to HumanEval to save 1 turn in each conversation.

* Added metrics utils script for HumanEval

* Updated the requirements in the README.

* Added documentation for HumanEval csv schemas

* Standardized on how the OAI_CONFIG_LIST is handled.

* Removed dot-slash from 'includes' path for cross-platform compatibility

* Missed a file.

* Updated readme to include known-working versions.
2023-11-04 10:38:43 +00:00


import os
import sys
import argparse
import csv


def metrics(results_fh):
    """
    Compute metrics from collated HumanEval results.

    Args:
        results_fh (File Stream): A file stream containing the collated results in CSV.
    """
    reader = csv.reader(results_fh)
    first_row = next(reader)  # Read the first line

    num_trials = len(first_row) - 1  # Don't count the first column (TestId)
    max_turns = 0
    num_rows = 0

    # Load the results. We'll need to iterate over them a few times.
    results = list()
    for row in reader:
        num_rows += 1
        name = row[0]
        trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
        for v in trials:
            if v is not None:
                max_turns = max(max_turns, v)
        results.append([name, trials])
    # Print the header
    header = ["Trial"]
    for i in range(1, max_turns + 1):
        header.append("cumulative_passes_by_turn_" + str(i))
    header.append("fails")
    header.append("missing")
    print(",".join(header))

    # Compute the metrics
    def _metrics_for_trial(t):
        counts = [None]
        fails = 0
        missing = 0

        # Compute cumulative passes for each conversation turn
        for i in range(1, max_turns + 1):
            counts.append(0)
            assert len(counts) == i + 1
            for r in results:
                v = r[1][t]
                if v is not None:
                    v = int(v)
                    if 0 <= v and v <= i:
                        counts[i] += 1

        # Count missing and failed
        for r in results:
            v = r[1][t]
            if v is None:
                missing += 1
            elif int(v) < 0:
                fails += 1

        # Prepare the row in the format specified by the header
        return str(t) + "," + ",".join([str(v) for v in counts[1:]]) + "," + str(fails) + "," + str(missing)

    # Print each row
    for t in range(0, num_trials):
        print(_metrics_for_trial(t))

###############################################################################
if __name__ == "__main__":
    script_path = os.path.realpath(__file__)
    script_name = os.path.basename(script_path)
    script_dir = os.path.dirname(script_path)

    parser = argparse.ArgumentParser(
        description=f"""
{script_name} will compute metrics on the collated results of the HumanEval scenarios. Use collate_human_eval.py to prepare input to this script.

The output will be formatted as a CSV with the following schema:

Trial,  cumulative_passes_by_turn_1, ..., cumulative_passes_by_turn_N, fails, missing
0       x_01, ...                         x_0N,                        y_0,   z_0
1       x_11, ...                         x_1N,                        y_1,   z_1
...
M       x_M1, ...                         x_MN,                        y_M,   z_M

Where:

  x_ij is the number of HumanEval problems in Trial i that achieved a passing result by conversation turn j.
  y_i  is the number of HumanEval problems in Trial i that never achieved a passing result (they failed).
  z_i  is the number of HumanEval problems in Trial i that have missing data.
""".strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "scenario",
        nargs="?",
        help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
        default="-",
    )
    args = parser.parse_args()

    if args.scenario == "" or args.scenario == "-":
        metrics(sys.stdin)
    else:
        with open(args.scenario, "rt") as fh:
            metrics(fh)
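
# Example usage (an illustrative sketch; the file name is a placeholder, and the exact
# collate_human_eval.py invocation is not shown because its arguments are not
# documented here):
#
#   python metrics_human_eval.py collated_results.csv
#   python metrics_human_eval.py < collated_results.csv   # equivalent: read from stdin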