import os
import json
import re
import sys
import argparse

# Captures the rest of the line that follows the first "FINAL ANSWER:" marker.
# Matching "up to a newline or end of text" (rather than requiring a newline)
# keeps an answer that is the very last thing in the log from being missed.
_FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:([^\n]*)")


def normalize_answer(a):
    """
    Normalize an answer string so that two answers can be compared loosely.

    Args:
        a (str): The raw answer text.

    Returns:
        str: The answer lower-cased, trimmed on both sides, with every run of
        whitespace replaced by a single space and any trailing '.', '!' or '?'
        characters removed.
    """
    collapsed = re.sub(r"\s+", " ", a.strip().lower())
    return re.sub(r"[\.\!\?]+$", "", collapsed)


def _score_instance(instance_dir):
    """
    Score one trial directory.

    Args:
        instance_dir (path): Folder holding "expected_answer.txt" and
            "console_log.txt" for a single trial.

    Returns:
        str: "1" if the normalized final answer found in the console log equals
        the normalized expected answer, "-1" if it does not, and "" if either
        of the two files is missing.
    """
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    console_log_file = os.path.join(instance_dir, "console_log.txt")

    # Without both files there is nothing to compare; report the trial as missing
    if not os.path.isfile(expected_answer_file) or not os.path.isfile(console_log_file):
        return ""

    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    # A log with no "FINAL ANSWER:" marker is treated as an empty answer
    m = _FINAL_ANSWER_RE.search(console_log)
    final_answer = m.group(1).strip() if m else ""

    if normalize_answer(expected_answer) == normalize_answer(final_answer):
        return "1"
    return "-1"


def collate(results_dir):
    """
    Collate the results of running GAIA, and print them to stdout as a CSV.

    Every entry of results_dir is treated as a test id. Its trials are the
    consecutively numbered sub-folders "0", "1", "2", ...; scanning stops at
    the first number that is not a directory.

    Args:
        results_dir (path): The folder where results were saved.
    """

    all_results = []
    max_instances = 0

    # os.listdir returns entries in arbitrary order; sort so the rows of the
    # CSV come out in the same order on every run.
    for test_id in sorted(os.listdir(results_dir)):
        test_path = os.path.join(results_dir, test_id)

        # Collect the results vector
        results = [test_id]

        instance = 0
        instance_dir = os.path.join(test_path, str(instance))
        while os.path.isdir(instance_dir):
            results.append(_score_instance(instance_dir))
            instance += 1
            instance_dir = os.path.join(test_path, str(instance))

        max_instances = max(max_instances, instance)

        # Buffer the results
        all_results.append(results)

    # Create a header
    print(",".join(["TestId"] + [f"Trial{i}" for i in range(max_instances)]))

    # Print a fully-populated table of results: rows with fewer trials than
    # the widest row are padded with empty cells
    for row in all_results:
        row.extend([""] * (max_instances + 1 - len(row)))
        print(",".join(row))


###############################################################################
if __name__ == "__main__":
    script_path = os.path.realpath(__file__)
    script_name = os.path.basename(script_path)
    script_dir = os.path.dirname(script_path)

    # Path to the default results directory
    # (relative to this script, up one directory, then into the results folder)
    default_results_dir = os.path.realpath(
        os.path.join(script_dir, os.path.pardir, "results", "gaia_validation_level_1__two_agents_gpt4")
    )

    parser = argparse.ArgumentParser(
        description=f"""
{script_name} will collate the results of the GAIA scenarios and output them to a CSV. The CSV format is as follows:

TestId,      Trial0,   Trial1, ...,    TrialN
uuid_1,      x_10,     x_11, ...,      X_1N
uuid_2,      x_20,     x_21, ...,      X_2N
...
uuid_M,      x_M0,     x_M1, ...,      X_MN

Where uuid_i is the identifier of the ith test question, and x_ij is 1 or -1 depending on if the test passed or failed, respectively. If data for the trial is missing (e.g., due to a runtime error, the value will be an empty string "".
""".strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "scenario",
        nargs="?",
        help="Path to the scenario results. (default: " + default_results_dir + ")",
        default=default_results_dir,
    )
    args = parser.parse_args()
    collate(args.scenario)