autogen/samples/tools/testbed/utils/metrics_gaia.py

import os
import sys
import argparse
import csv


def metrics(results_fh):
    """
    Compute metrics from collated GAIA results.

    Args:
        results_fh (File Stream): A file stream containing the collated results in CSV.
    """

    reader = csv.reader(results_fh)
    first_row = next(reader)  # Read the first line

    num_trials = len(first_row) - 1  # Don't count the first column (TestId)

    # Set up the counters
    counters = []
    for i in range(0, num_trials):
        counters.append({"successes": 0, "failures": 0, "missing": 0})

    # Load the results. We'll need to iterate over them a few times.
    results = list()
    for row in reader:
        name = row[0]
        trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
        for i in range(0, len(trials)):
            v = trials[i]
            if v is None:
                counters[i]["missing"] += 1
            elif v > 0:
                counters[i]["successes"] += 1
            else:
                counters[i]["failures"] += 1

        results.append([name, trials])

    def _safe_div(num, denom):
        if denom == 0:
            return ""
        else:
            return num / denom

    # Print the header
    for i in range(0, len(counters)):
        counter = counters[i]
        n = counter["successes"] + counter["failures"] + counter["missing"]
        score = _safe_div(counter["successes"], n)
        print(f"{i},{n},{counter['successes']},{counter['failures']},{counter['missing']},{score}")


###############################################################################
if __name__ == "__main__":
    script_path = os.path.realpath(__file__)
    script_name = os.path.basename(script_path)
    script_dir = os.path.dirname(script_path)

    parser = argparse.ArgumentParser(
        description=f"""
{script_name} will compute metrics on the collated results of the GAIA scenarios. Use collate_gaia.py to prepare input to this script.

The output will be formatted as a CSV with the following schema:

Trial,  n,      successes,  failures,   missing,    score
0       N_0,    s_0         f_0         m_0,        p_0
0       N_1,    s_1         f_1         m_1,        p_1
...
M       N_M,    s_M         f_M         m_M,        p_M

Where:

    N_i is the number of questions in trial i
    s_i is the number of successes in trial i
    f_i is the number of failures in trial i
    m_i is the number of missing values in trial i
    p_i is the proportion of successes in trail i (i.e, s_i / N_i)

""".strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "scenario",
        nargs="?",
        help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
        default="-",
    )
    args = parser.parse_args()

    if args.scenario == "" or args.scenario == "-":
        metrics(sys.stdin)
    else:
        with open(args.scenario, "rt") as fh:
            metrics(fh)
Adds the GAIA benchark to the Testbed. This PR depends on #792 (#810) * Re-added completion logging when using older versions of autogen. * Extended scenario definitions and templating to include folders. * Prepare collate_human_eval.py for working with group chat scenarios. * Converted HumanEval to the folder-based approach, and added GroupChat scenarios. * Fixed the default termination message. * Fixed another termination condition. * Updated compatible autogen versions. * Added initial support for GAIA benchmark. * Fixed a bug in executing the finalize scripts. * Generalized the template further to support multiple folder copy operations. * Refined GAIA support, and broke scenarios down by difficulty. * Added some experimental scripts for computing metrics over GAIA. This is a first version, and will likely need refinement. * Added instructions for cloning GAIA * Updated README to fix some typos. * Added a script to format GAIA reslts for the leaderboard. * Update samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py Co-authored-by: LeoLjl <3110503618@qq.com> --------- Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu> Co-authored-by: LeoLjl <3110503618@qq.com> 2023-12-05 17:46:10 -08:00			`import os`
			`import sys`
			`import argparse`
			`import csv`


			`def metrics(results_fh):`
			`"""`
			`Compute metrics from collated GAIA results.`

			`Args:`
			`results_fh (File Stream): A file stream containing the collated results in CSV.`
			`"""`

			`reader = csv.reader(results_fh)`
			`first_row = next(reader) # Read the first line`

			`num_trials = len(first_row) - 1 # Don't count the first column (TestId)`

			`# Set up the counters`
			`counters = []`
			`for i in range(0, num_trials):`
			`counters.append({"successes": 0, "failures": 0, "missing": 0})`

			`# Load the results. We'll need to iterate over them a few times.`
			`results = list()`
			`for row in reader:`
			`name = row[0]`
			`trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]`
			`for i in range(0, len(trials)):`
			`v = trials[i]`
			`if v is None:`
			`counters[i]["missing"] += 1`
			`elif v > 0:`
			`counters[i]["successes"] += 1`
			`else:`
			`counters[i]["failures"] += 1`

			`results.append([name, trials])`

			`def _safe_div(num, denom):`
			`if denom == 0:`
			`return ""`
			`else:`
			`return num / denom`

			`# Print the header`
			`for i in range(0, len(counters)):`
			`counter = counters[i]`
			`n = counter["successes"] + counter["failures"] + counter["missing"]`
			`score = _safe_div(counter["successes"], n)`
			`print(f"{i},{n},{counter['successes']},{counter['failures']},{counter['missing']},{score}")`


			`###############################################################################`
			`if __name__ == "__main__":`
			`script_path = os.path.realpath(__file__)`
			`script_name = os.path.basename(script_path)`
			`script_dir = os.path.dirname(script_path)`

			`parser = argparse.ArgumentParser(`
			`description=f"""`
			`{script_name} will compute metrics on the collated results of the GAIA scenarios. Use collate_gaia.py to prepare input to this script.`

			`The output will be formatted as a CSV with the following schema:`

			`Trial, n, successes, failures, missing, score`
			`0 N_0, s_0 f_0 m_0, p_0`
			`0 N_1, s_1 f_1 m_1, p_1`
			`...`
			`M N_M, s_M f_M m_M, p_M`

			`Where:`

			`N_i is the number of questions in trial i`
			`s_i is the number of successes in trial i`
			`f_i is the number of failures in trial i`
			`m_i is the number of missing values in trial i`
			`p_i is the proportion of successes in trail i (i.e, s_i / N_i)`

			`""".strip(),`
			`formatter_class=argparse.RawTextHelpFormatter,`
			`)`

			`parser.add_argument(`
			`"scenario",`
			`nargs="?",`
			`help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",`
			`default="-",`
			`)`
			`args = parser.parse_args()`

			`if args.scenario == "" or args.scenario == "-":`
			`metrics(sys.stdin)`
			`else:`
			`with open(args.scenario, "rt") as fh:`
			`metrics(fh)`