2023-12-05 17:46:10 -08:00
|
|
|
import os
|
|
|
|
import json
|
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
import argparse
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_answer(a):
    """
    Canonicalize an answer string so that two answers can be compared loosely.

    Args:
        a (str): The raw answer text.

    Returns:
        str: The answer trimmed, lower-cased, with every run of whitespace
            collapsed to one space and any trailing ".", "!" or "?" removed.
    """
    # Trim both ends and fold to lower case
    text = a.strip().lower()

    # Collapse each run of whitespace (spaces, tabs, newlines) into one space
    text = re.sub(r"\s+", " ", text)

    # Drop the run of sentence-ending punctuation at the very end, if any
    return re.sub(r"[\.\!\?]+$", "", text)
|
|
|
|
|
|
|
|
|
|
|
|
def _score_instance(instance_dir):
    """
    Score a single trial folder.

    Args:
        instance_dir (path): Folder expected to hold "expected_answer.txt" and "console_log.txt".

    Returns:
        str: "1" if the normalized final answer equals the normalized expected
            answer, "-1" if it does not, and "" if either file is missing.
    """
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    console_log_file = os.path.join(instance_dir, "console_log.txt")

    # Without both files the trial cannot be scored; report it as missing data
    if not (os.path.isfile(expected_answer_file) and os.path.isfile(console_log_file)):
        return ""

    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    # Capture the remainder of the line after the first "FINAL ANSWER:" marker.
    # Matching "[^\n]*" (rather than requiring a trailing "\n") also picks up an
    # answer that sits on the last line of a log with no terminating newline.
    m = re.search(r"FINAL ANSWER:([^\n]*)", console_log)
    final_answer = m.group(1).strip() if m else ""

    if normalize_answer(expected_answer) == normalize_answer(final_answer):
        return "1"
    return "-1"


def collate(results_dir):
    """
    Collate the results of running GAIA

    Prints a CSV table to stdout: a "TestId,Trial0,...,TrialN" header followed by
    one row per entry of results_dir. Each trial cell is "1" (answer matched),
    "-1" (answer did not match) or "" (data for that trial is missing). Rows are
    padded with "" so every row has the same number of columns.

    Args:
        results_dir (path): The folder where the results were saved.
    """

    all_results = []
    max_instances = 0

    # os.listdir returns entries in arbitrary order; sort so the CSV is reproducible
    for test_id in sorted(os.listdir(results_dir)):
        test_path = os.path.join(results_dir, test_id)

        # Collect the results vector
        results = [test_id]

        # Trials live in consecutively numbered sub-folders: 0, 1, 2, ...
        instance = 0
        instance_dir = os.path.join(test_path, str(instance))
        while os.path.isdir(instance_dir):
            results.append(_score_instance(instance_dir))
            instance += 1
            instance_dir = os.path.join(test_path, str(instance))

        max_instances = max(max_instances, instance)

        # Buffer the results
        all_results.append(results)

    # Create a header
    print(",".join(["TestId"] + [f"Trial{i}" for i in range(max_instances)]))

    # Print a fully-populated table of results
    for r in all_results:
        # Pad short rows so every row has max_instances trial columns
        r.extend([""] * (max_instances + 1 - len(r)))
        print(",".join(r))
|
|
|
|
|
|
|
|
|
|
|
|
###############################################################################
|
|
|
|
if __name__ == "__main__":
    # Resolve where this script lives; its file name is shown in the help text
    script_path = os.path.realpath(__file__)
    script_dir, script_name = os.path.split(script_path)

    # Path to the default results directory
    # (relative to this script, up one directory, then into the results folder)
    default_results_dir = os.path.realpath(
        os.path.join(
            script_dir,
            os.path.pardir,
            "results",
            "gaia_validation_level_1__two_agents_gpt4",
        )
    )

    # Help text is kept flush-left so RawTextHelpFormatter prints it verbatim
    description = f"""
{script_name} will collate the results of the GAIA scenarios and output them to a CSV. The CSV format is as follows:

TestId, Trial0, Trial1, ..., TrialN
uuid_1, x_10, x_11, ..., X_1N
uuid_2, x_20, x_21, ..., X_2N
...
uuid_M, x_M0, x_M1, ..., X_MN

Where uuid_i is the identifier of the ith test question, and x_ij is 1 or -1 depending on if the test passed or failed, respectively. If data for the trial is missing (e.g., due to a runtime error, the value will be an empty string "".
""".strip()

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # Single optional positional argument: the scenario results folder to collate
    parser.add_argument(
        "scenario",
        nargs="?",
        default=default_results_dir,
        help=f"Path to the scenario results. (default: {default_results_dir})",
    )

    args = parser.parse_args()
    collate(args.scenario)
|