mirror of
https://github.com/microsoft/autogen.git
synced 2025-08-12 10:41:13 +00:00
98 lines
2.9 KiB
Python
98 lines
2.9 KiB
Python
![]() |
import os
|
||
|
import sys
|
||
|
import argparse
|
||
|
import csv
|
||
|
|
||
|
|
||
|
def metrics(results_fh):
|
||
|
"""
|
||
|
Compute metrics from collated GAIA results.
|
||
|
|
||
|
Args:
|
||
|
results_fh (File Stream): A file stream containing the collated results in CSV.
|
||
|
"""
|
||
|
|
||
|
reader = csv.reader(results_fh)
|
||
|
first_row = next(reader) # Read the first line
|
||
|
|
||
|
num_trials = len(first_row) - 1 # Don't count the first column (TestId)
|
||
|
|
||
|
# Set up the counters
|
||
|
counters = []
|
||
|
for i in range(0, num_trials):
|
||
|
counters.append({"successes": 0, "failures": 0, "missing": 0})
|
||
|
|
||
|
# Load the results. We'll need to iterate over them a few times.
|
||
|
results = list()
|
||
|
for row in reader:
|
||
|
name = row[0]
|
||
|
trials = [(None if v.strip() == "" else int(v)) for v in row[1:]]
|
||
|
for i in range(0, len(trials)):
|
||
|
v = trials[i]
|
||
|
if v is None:
|
||
|
counters[i]["missing"] += 1
|
||
|
elif v > 0:
|
||
|
counters[i]["successes"] += 1
|
||
|
else:
|
||
|
counters[i]["failures"] += 1
|
||
|
|
||
|
results.append([name, trials])
|
||
|
|
||
|
def _safe_div(num, denom):
|
||
|
if denom == 0:
|
||
|
return ""
|
||
|
else:
|
||
|
return num / denom
|
||
|
|
||
|
# Print the header
|
||
|
for i in range(0, len(counters)):
|
||
|
counter = counters[i]
|
||
|
n = counter["successes"] + counter["failures"] + counter["missing"]
|
||
|
score = _safe_div(counter["successes"], n)
|
||
|
print(f"{i},{n},{counter['successes']},{counter['failures']},{counter['missing']},{score}")
|
||
|
|
||
|
|
||
|
###############################################################################
|
||
|
if __name__ == "__main__":
|
||
|
script_path = os.path.realpath(__file__)
|
||
|
script_name = os.path.basename(script_path)
|
||
|
script_dir = os.path.dirname(script_path)
|
||
|
|
||
|
parser = argparse.ArgumentParser(
|
||
|
description=f"""
|
||
|
{script_name} will compute metrics on the collated results of the GAIA scenarios. Use collate_gaia.py to prepare input to this script.
|
||
|
|
||
|
The output will be formatted as a CSV with the following schema:
|
||
|
|
||
|
Trial, n, successes, failures, missing, score
|
||
|
0 N_0, s_0 f_0 m_0, p_0
|
||
|
0 N_1, s_1 f_1 m_1, p_1
|
||
|
...
|
||
|
M N_M, s_M f_M m_M, p_M
|
||
|
|
||
|
Where:
|
||
|
|
||
|
N_i is the number of questions in trial i
|
||
|
s_i is the number of successes in trial i
|
||
|
f_i is the number of failures in trial i
|
||
|
m_i is the number of missing values in trial i
|
||
|
p_i is the proportion of successes in trail i (i.e, s_i / N_i)
|
||
|
|
||
|
""".strip(),
|
||
|
formatter_class=argparse.RawTextHelpFormatter,
|
||
|
)
|
||
|
|
||
|
parser.add_argument(
|
||
|
"scenario",
|
||
|
nargs="?",
|
||
|
help="Path to collated results. If '-' or omitted, read from stdin. (default: '-')",
|
||
|
default="-",
|
||
|
)
|
||
|
args = parser.parse_args()
|
||
|
|
||
|
if args.scenario == "" or args.scenario == "-":
|
||
|
metrics(sys.stdin)
|
||
|
else:
|
||
|
with open(args.scenario, "rt") as fh:
|
||
|
metrics(fh)
|