autogen/samples/tools/testbed/utils/metrics_gaia.py
afourney f8b4b4259b
Adds the GAIA benchmark to the Testbed. This PR depends on #792 (#810)
* Re-added completion logging when using older versions of autogen.

* Extended scenario definitions and templating to include folders.

* Prepared collate_human_eval.py for working with group chat scenarios.

* Converted HumanEval to the folder-based approach, and added GroupChat scenarios.

* Fixed the default termination message.

* Fixed another termination condition.

* Updated compatible autogen versions.

* Added initial support for GAIA benchmark.

* Fixed a bug in executing the finalize scripts.

* Generalized the template further to support multiple folder copy operations.

* Refined GAIA support, and broke scenarios down by difficulty.

* Added some experimental scripts for computing metrics over GAIA. This is a first version, and will likely need refinement.

* Added instructions for cloning GAIA.

* Updated README to fix some typos.

* Added a script to format GAIA results for the leaderboard.

* Update samples/tools/testbed/scenarios/GAIA/Templates/BasicTwoAgents/scenario.py

Co-authored-by: LeoLjl <3110503618@qq.com>

---------

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
Co-authored-by: LeoLjl <3110503618@qq.com>
2023-12-06 01:46:10 +00:00

98 lines
2.9 KiB
Python

import os
import sys
import argparse
import csv


def metrics(results_fh):
    """
    Compute metrics from collated GAIA results.

    Args:
        results_fh (File Stream): A file stream containing the collated results in CSV.
    """

    reader = csv.reader(results_fh)
    first_row = next(reader)  # Read the first line

    num_trials = len(first_row) - 1  # Don't count the first column (TestId)

    # Set up the counters
    counters = []
    for i in range(0, num_trials):
        counters.append({"successes": 0, "failures": 0, "missing": 0})

    # Load the results. We'll need to iterate over them a few times.
    results = list()
    for row in reader:
        name = row[0]
        # A blank cell is a missing result; anything else parses as an integer
        trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
        for i in range(0, len(trials)):
            v = trials[i]
            if v is None:
                counters[i]["missing"] += 1
            elif v > 0:
                # Any positive value counts as a success
                counters[i]["successes"] += 1
            else:
                counters[i]["failures"] += 1
        results.append([name, trials])

    def _safe_div(num, denom):
        # Return an empty cell rather than dividing by zero
        if denom == 0:
            return ""
        else:
            return num / denom

    # Print the results as CSV rows, one per trial
    for i in range(0, len(counters)):
        counter = counters[i]
        n = counter["successes"] + counter["failures"] + counter["missing"]
        score = _safe_div(counter["successes"], n)
        print(f"{i},{n},{counter['successes']},{counter['failures']},{counter['missing']},{score}")
###############################################################################
if __name__ == "__main__":
    script_path = os.path.realpath(__file__)
    script_name = os.path.basename(script_path)
    script_dir = os.path.dirname(script_path)

    parser = argparse.ArgumentParser(
        description=f"""
{script_name} will compute metrics on the collated results of the GAIA scenarios. Use collate_gaia.py to prepare input to this script.

The output will be formatted as a CSV with the following schema:

Trial, n,   successes, failures, missing, score
0,     N_0, s_0,       f_0,      m_0,     p_0
1,     N_1, s_1,       f_1,      m_1,     p_1
...
M,     N_M, s_M,       f_M,      m_M,     p_M

Where:

    N_i is the number of questions in trial i
    s_i is the number of successes in trial i
    f_i is the number of failures in trial i
    m_i is the number of missing values in trial i
    p_i is the proportion of successes in trial i (i.e., s_i / N_i)
""".strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "scenario",
        nargs="?",
        help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
        default="-",
    )
    args = parser.parse_args()

    if args.scenario == "" or args.scenario == "-":
        metrics(sys.stdin)
    else:
        with open(args.scenario, "rt") as fh:
            metrics(fh)
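
The script can read the collated CSV from a file argument or from stdin, so either invocation below works; collated_results.csv is just a placeholder name, and collate_gaia.py's own arguments (elided here) are described in that script's help:

    python metrics_gaia.py collated_results.csv
    python collate_gaia.py ... | python metrics_gaia.py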