#
# Run this file to convert a local clone of the GAIA benchmark into corresponding testbed scenarios:
# (output: ../scenarios/GAIA/gaia_[validation|test]_level_[1-3]__two_agents_gpt4.jsonl)
#
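# Usage (see the SYNTAX message in the __main__ block below; <this script> stands for this file's name):
#     python <this script> [path to GAIA repository]
# where the argument is a local clone of https://huggingface.co/datasets/gaia-benchmark/GAIA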

import json
import os
import sys
import shutil

SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
SCENARIOS_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir, "scenarios", "GAIA"))
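

# Each line of an emitted .jsonl file is one scenario record. An illustrative example
# (the task id, file name, question, and answer below are made up for this comment):
#
#   {"id": "example-task-id",
#    "template": ["Templates/BasicTwoAgents", ["GAIA_Files/example.csv", "coding/example.csv"]],
#    "substitutions": {
#        "scenario.py": {"__MODEL__": "gpt-4", "__FILE_NAME__": "example.csv", "__PROMPT__": "..."},
#        "expected_answer.txt": {"__EXPECTED_ANSWER__": "..."}}}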
def create_jsonl(name, tasks, template, model):
    """Creates a JSONL scenario file with a given name, list of GAIA tasks, template path, and model."""

    with open(os.path.join(SCENARIOS_DIR, name + ".jsonl"), "wt") as fh:
        for task in tasks:
            print(f"Converting: [{name}] {task['task_id']}")

            # Figure out what files we need to copy
            template_cp_list = [template]
            if len(task["file_name"].strip()) > 0:
                template_cp_list.append(
                    [
                        os.path.join("GAIA_Files", task["file_name"].strip()),
                        os.path.join("coding", task["file_name"].strip()),
                    ]
                )
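
            # Build the scenario record: the task id, the template files to copy into
            # the scenario directory, and the string substitutions to perform in each
            # copied file.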
            record = {
                "id": task["task_id"],
                "template": template_cp_list,
                "substitutions": {
                    "scenario.py": {
                        "__MODEL__": model,
                        "__FILE_NAME__": task["file_name"],
                        "__PROMPT__": task["Question"],
                    },
                    "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
                },
            }

            fh.write(json.dumps(record).strip() + "\n")


###############################################################################
if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit(
            f"SYNTAX: python {SCRIPT_NAME} [path to GAIA repository]\n\nNote: to clone the GAIA repository, do 'git clone https://huggingface.co/datasets/gaia-benchmark/GAIA'"
        )

    # Copy the relevant GAIA files
    gaia_path = os.path.realpath(sys.argv[1])

    gaia_validation_files = os.path.join(gaia_path, "2023", "validation")
    gaia_test_files = os.path.join(gaia_path, "2023", "test")

    if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files):
        sys.exit(f"Error: '{gaia_path}' does not appear to be a copy of the GAIA repository.")

    gaia_merged_files = os.path.realpath(os.path.join(SCENARIOS_DIR, "GAIA_Files"))

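    # Merge the validation and test attachments into a single GAIA_Files directory;
    # metadata.jsonl is excluded here and read separately below.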
    shutil.copytree(
        gaia_validation_files, gaia_merged_files, ignore=shutil.ignore_patterns("metadata.jsonl"), dirs_exist_ok=True
    )
    shutil.copytree(
        gaia_test_files, gaia_merged_files, ignore=shutil.ignore_patterns("metadata.jsonl"), dirs_exist_ok=True
    )

    # Load the GAIA data
    gaia_validation_tasks = [[], [], []]
    with open(os.path.join(gaia_validation_files, "metadata.jsonl")) as fh:
        for line in fh:
            data = json.loads(line)
            gaia_validation_tasks[data["Level"] - 1].append(data)

    gaia_test_tasks = [[], [], []]
    with open(os.path.join(gaia_test_files, "metadata.jsonl")) as fh:
        for line in fh:
            data = json.loads(line)
            gaia_test_tasks[data["Level"] - 1].append(data)

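    # gaia_*_tasks[0..2] hold Level 1-3 tasks respectively. Every (model x template)
    # combination below yields six scenario files (validation/test x levels 1-3);
    # add entries to either dict to generate more variants. Template paths are
    # relative to SCENARIOS_DIR (see the coding-directory check below).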
    models = {
        "gpt4": "gpt-4",
    }

    templates = {
        "two_agents": "Templates/BasicTwoAgents",
    }

    # Add coding directories if needed (these are usually empty and left out of the repo)
    for template in templates.values():
        code_dir_path = os.path.join(SCENARIOS_DIR, template, "coding")
        if not os.path.isdir(code_dir_path):
            os.mkdir(code_dir_path)

    # Create the various combinations of [models] x [templates]
    for m in models.items():
        for t in templates.items():
            create_jsonl(f"gaia_validation_level_1__{t[0]}_{m[0]}", gaia_validation_tasks[0], t[1], m[1])
            create_jsonl(f"gaia_validation_level_2__{t[0]}_{m[0]}", gaia_validation_tasks[1], t[1], m[1])
            create_jsonl(f"gaia_validation_level_3__{t[0]}_{m[0]}", gaia_validation_tasks[2], t[1], m[1])
            create_jsonl(f"gaia_test_level_1__{t[0]}_{m[0]}", gaia_test_tasks[0], t[1], m[1])
            create_jsonl(f"gaia_test_level_2__{t[0]}_{m[0]}", gaia_test_tasks[1], t[1], m[1])
            create_jsonl(f"gaia_test_level_3__{t[0]}_{m[0]}", gaia_test_tasks[2], t[1], m[1])
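    # With the default dicts above, this writes six scenario files into SCENARIOS_DIR,
    # e.g. gaia_validation_level_1__two_agents_gpt4.jsonl through gaia_test_level_3__two_agents_gpt4.jsonl.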