Version 0.0.2 of AutoGenBench (#1548)

* Prints the version of AutoGenBench from the command line, closing #1458 (see the example after this list)

* Added autogenbench version to timestamp.txt

* Attempting to fix formatting.

* Add a gitignore for autogenbench

* Generalize to read all template dirs from Templates

* AutoGenBench logs telemetry when available.

* Remove spaces if present from template names.

* Bump version.

* Fixed formatting.

* Allow native warning to be skipped. Mount autogen repo in Docker if it can be found (experimental).

* Native execution now occurs in a venv.

* Bump version.

* Fixed a prompt escaping bug evident in GAIA task '6f37996b-2ac7-44b0-8e68-6d28256631b4'

* Updated all scenarios to use template discovery.

* Update with main version of runtime_logging.
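As a quick check of the version flag described above, it can be exercised from Python; a minimal sketch (assumes autogenbench is installed on PATH, and the exact string depends on the installed build):

    import subprocess

    # Prints e.g. "AutoGenBench version 0.0.2a4" for this release
    result = subprocess.run(["autogenbench", "--version"], capture_output=True, text=True)
    print(result.stdout.strip())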

---------

Co-authored-by: gagb <gagb@users.noreply.github.com>
afourney 2024-02-24 10:12:57 -08:00 committed by GitHub
parent 477598afff
commit 085bf6cf3d
15 changed files with 202 additions and 53 deletions

samples/tools/autogenbench/.gitignore (new, vendored)
View File

@@ -0,0 +1,3 @@
scenarios/*/Downloads
scenarios/*/Tasks
*/Results

View File

@@ -1,4 +1,5 @@
import sys
from .version import __version__
from .run_cmd import run_cli
from .clone_cmd import clone_cli
from .tabulate_cmd import tabulate_cli
@@ -9,6 +10,7 @@ def main(args=None):
args = sys.argv[:] # Shallow copy
invocation_cmd = "autogenbench"
version_string = f"AutoGenBench version {__version__}"
commands = [
{
@@ -26,6 +28,11 @@
"description": "tabulate the results of a previous run",
"function": tabulate_cli,
},
{
"command": "--version",
"description": f"print the version of {invocation_cmd}",
"function": lambda _args: print(f"{version_string}"),
},
{"command": "--help", "description": "print this message", "function": None},
]
@@ -40,6 +47,8 @@
commands_details += f" {padded_cmd}: {c['description']}\n"
usage_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
Where, COMMAND is one of: {commands_list}
@@ -49,6 +58,8 @@ and ARGS are specific to the command.
""".strip()
help_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
{invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typical session might resemble:

View File

@@ -11,6 +11,7 @@ import docker
import random
from autogen import config_list_from_json
from autogen.oai.openai_utils import filter_config
from .version import __version__
# Figure out where everything is
SCRIPT_PATH = os.path.realpath(__file__)
@@ -247,17 +248,25 @@ def get_scenario_env(config_list, env_file=DEFAULT_ENV_FILE):
Returns: A dictionary of keys and values that need to be added to the system environment.
"""
env = dict()
if os.path.isfile(env_file):
with open(env_file, "rt") as fh:
env = json.loads(fh.read())
config_list_json = json.dumps(config_list)
env["OAI_CONFIG_LIST"] = config_list_json
# Populate with commonly needed keys
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
env["OPENAI_API_KEY"] = openai_api_key
bing_api_key = os.environ.get("BING_API_KEY")
if bing_api_key is not None and len(bing_api_key.strip()) > 0:
env["BING_API_KEY"] = bing_api_key
# Update with any values from the ENV.json file
if os.path.isfile(env_file):
with open(env_file, "rt") as fh:
env.update(json.loads(fh.read()))
# Include the config_list that we are using
config_list_json = json.dumps(config_list)
env["OAI_CONFIG_LIST"] = config_list_json
return env
@@ -286,6 +295,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Native"
echo "autogenbench version: {__version__}" > timestamp.txt
# Create and activate the virtual environment
# This is called in a subprocess, and will not impact the parent
{sys.executable} -m venv .autogenbench_venv
. .autogenbench_venv/bin/activate
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
@@ -298,6 +313,7 @@ if [ -f scenario_init.sh ] ; then
fi
# Run the scenario
pip install -r requirements.txt
echo SCENARIO.PY STARTING !#!#
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
EXIT_CODE=$?
@@ -312,6 +328,10 @@ if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
@@ -322,6 +342,12 @@ if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
# We don't need to deactivate the venv because it's
# contained in the subprocess; but we should clean it up
if [ -d .autogenbench_venv ] ; then
rm -Rf .autogenbench_venv
fi
echo RUN.SH COMPLETE !#!#
"""
)
@@ -387,7 +413,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Docker"
umask 000
echo "autogenbench version: {__version__}" > timestamp.txt
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
@@ -415,6 +443,10 @@ if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
@@ -429,18 +461,31 @@ echo RUN.SH COMPLETE !#!#
"""
)
print("\n\n" + work_dir + "\n===================================================================")
# Figure out what folders to mount
volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
# Add the autogen repo if we can find it
autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
if autogen_repo_base is None:
autogen_repo_base = find_autogen_repo(os.getcwd())
elif not os.path.isdir(autogen_repo_base):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
if autogen_repo_base is not None:
volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen", "mode": "rw"}
print("Mounting:")
for k in volumes:
bind = volumes[k]["bind"]
mode = volumes[k]["mode"].upper()
if bind == "/workspace":
k = os.path.relpath(k)
print(f"[{mode}]\t'{k}' => '{bind}'")
print("===================================================================")
# Create and run the container
abs_path = str(pathlib.Path(work_dir).absolute())
container = client.containers.run(
image,
command=["sh", "run.sh"],
working_dir="/workspace",
environment=env,
detach=True,
# get absolute path to the working directory
volumes={abs_path: {"bind": "/workspace", "mode": "rw"}},
image, command=["sh", "run.sh"], working_dir="/workspace", environment=env, detach=True, volumes=volumes
)
# Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
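The experimental mount can also be pinned to a specific checkout instead of relying on find_autogen_repo's upward search; a hedged sketch (the repo path and task file name are hypothetical; per the check above, an invalid path raises FileNotFoundError):

    import os
    import subprocess

    # Pin the autogen checkout that gets mounted at /autogen in the container
    env = dict(os.environ, AUTOGENBENCH_REPO_BASE="/home/user/autogen")
    subprocess.run(["autogenbench", "run", "Tasks/human_eval_TwoAgents.jsonl"], env=env)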
@@ -485,6 +530,34 @@ def build_default_docker_image(docker_client, image_tag):
sys.stdout.write(segment["stream"])
def find_autogen_repo(path):
"""
Utility for identifying if the path is a subdirectory of the autogen repo.
Returns: the path to the root of the autogen repo if one is found, otherwise None
"""
# Normalize the path (we expect a directory)
path = os.path.abspath(path)
if os.path.isfile(path):
path = os.path.dirname(path)
while True:
test_path = os.path.join(path, "autogen", "agentchat", "conversable_agent.py") # We found autogen
if os.path.isfile(test_path):
return path
# Stop if we hit the root
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
if parent_dir == path:
break
# Keep searching
path = parent_dir
return None
def run_cli(args):
invocation_cmd = args[0]
args = args[1:]
@@ -581,12 +654,23 @@ def run_cli(args):
if parsed_args.requirements is not None:
sys.exit("--requirements is not compatible with --native. Exiting.")
choice = input(
'WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\nAre you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
sys.stderr.write(
"WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
)
if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
# Does an environment variable override the prompt?
allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
if allow_native is None or allow_native == "":
choice = input(
'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
)
if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
elif allow_native.strip().lower() != "yes":
sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
else:
sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
time.sleep(0.75) # Pause very briefly so the message isn't lost in the noise
# Parse the subsample
subsample = None
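Given the logic above, scripted or CI runs can pre-set the variable to skip the confirmation prompt entirely; a sketch (the task file name is hypothetical):

    import os
    import subprocess

    # "yes" (any casing, after trimming) suppresses the native-execution prompt
    env = dict(os.environ, AUTOGENBENCH_ALLOW_NATIVE="yes")
    subprocess.run(["autogenbench", "run", "--native", "Tasks/human_eval_TwoAgents.jsonl"], env=env)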

View File

@@ -6,6 +6,15 @@ import json
AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)
# Try importing the runtime_logging module (only available in some branches)
LOGGING_ENABLED = False
try:
import autogen.runtime_logging
LOGGING_ENABLED = True
except ImportError:
pass
def default_llm_config(config_list, timeout=180):
"""Return a default config list with a given timeout, and with caching disabled.
@@ -57,6 +66,10 @@ def init():
if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
autogen.Completion.start_logging(compact=False)
# Start logging
if LOGGING_ENABLED:
autogen.runtime_logging.start(config={"dbname": "telemetry.db"})
def finalize(agents):
"""Helper function to finalize logging in a testbed scenario.
@@ -89,3 +102,7 @@ def finalize(agents):
with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh:
fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
autogen.Completion.stop_logging()
# Stop logging
if LOGGING_ENABLED:
autogen.runtime_logging.stop()
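When the runtime logger is present, telemetry lands in telemetry.db inside the scenario's working directory. A minimal sketch for inspecting it after a run (assumes the file is SQLite, as the .db name in the config above suggests; table names depend on the logger's schema):

    import sqlite3

    con = sqlite3.connect("telemetry.db")
    # List whatever tables the runtime logger created
    tables = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
    print(tables)
    con.close()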

View File

@@ -1 +1 @@
__version__ = "0.0.1"
__version__ = "0.0.2a4"

View File

@@ -47,3 +47,8 @@ exclude = ["*.tests*"]
[project.scripts]
autogenbench = "autogenbench.cli:main"
[tool.black]
# https://github.com/psf/black
line-length = 120
exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"
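With this table in pyproject.toml, black picks the settings up automatically when invoked from the package root; a trivial sketch (assumes black is installed):

    import subprocess

    # Formats the package using the line-length and exclude rules declared above
    subprocess.run(["black", "."])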

View File

@@ -8,6 +8,7 @@ import os
import sys
import glob
import base64
import re
from huggingface_hub import snapshot_download
SCRIPT_PATH = os.path.realpath(__file__)
@@ -88,7 +89,12 @@ def create_jsonl(name, template):
###############################################################################
def main():
templates = {"two_agents": os.path.join(TEMPLATES_DIR, "TwoAgents")}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Add coding directories if needed (these are usually empty and left out of the repo)
for template in templates.values():
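Because discovered template names feed into generated task identifiers and file names, whitespace is stripped during discovery; a small sketch of that normalization (the directory name is hypothetical):

    import re

    # "Society Of Mind" -> "SocietyOfMind"
    print(re.sub(r"\s", "", "Society Of Mind"))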

View File

@@ -4,9 +4,11 @@
"Scripts/init_tasks.py": "Scripts/init_tasks.py",
"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",
"Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",
"Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",
"Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
"Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",
"Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
"Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"
}
}

View File

@@ -6,6 +6,7 @@
import json
import os
import sys
import re
from huggingface_hub import snapshot_download
SCRIPT_PATH = os.path.realpath(__file__)
@@ -60,9 +61,9 @@ def create_jsonl(name, tasks, files_dir, template):
"substitutions": {
"scenario.py": {
"__FILE_NAME__": task["file_name"],
"__PROMPT__": task["Question"],
},
"expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
"prompt.txt": {"__PROMPT__": task["Question"]},
},
}
@@ -97,10 +98,12 @@ def main():
gaia_test_tasks[data["Level"] - 1].append(data)
templates = {
"two_agents": os.path.join(TEMPLATES_DIR, "BasicTwoAgents"),
"soc": os.path.join(TEMPLATES_DIR, "SocietyOfMind"),
}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Add coding directories if needed (these are usually empty and left out of the repo)
for template in templates.values():

View File

@@ -7,6 +7,10 @@ import testbed_utils
testbed_utils.init()
##############################
# Read the prompt
PROMPT = ""
with open("prompt.txt", "rt") as fh:
PROMPT = fh.read().strip()
GAIA_SYSTEM_MESSAGE = (
"You are a helpful AI assistant, and today's date is "
@@ -48,9 +52,7 @@ user_proxy = autogen.UserProxyAgent(
)
filename = "__FILE_NAME__".strip()
question = """
__PROMPT__
""".strip()
question = PROMPT
if len(filename) > 0:
question = f"Consider the file '{filename}', which can be read from the current working directory. If you need to read or write it, output python code in a code block (```python) to do so. {question}"

View File

@@ -0,0 +1 @@
__PROMPT__
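Reading the prompt from the new prompt.txt template, rather than substituting __PROMPT__ directly into scenario.py as before, is what closes the escaping bug: prompt text containing quotes or backslashes can no longer produce invalid Python. A sketch of the failure mode and the fix (the offending prompt content is hypothetical):

    # Old approach: raw substitution into a triple-quoted literal could break parsing,
    # e.g. a prompt containing """ or ending in a backslash -> SyntaxError at run time.
    # New approach: the prompt is plain data read at run time, so its content is inert.
    with open("prompt.txt", "rt") as fh:
        PROMPT = fh.read().strip()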

View File

@@ -15,6 +15,11 @@ from autogen.token_count_utils import count_token, get_max_token_limit
testbed_utils.init()
##############################
# Read the prompt
PROMPT = ""
with open("prompt.txt", "rt") as fh:
PROMPT = fh.read().strip()
config_list = autogen.config_list_from_json(
"OAI_CONFIG_LIST",
filter_dict={"model": ["gpt-4"]},
@@ -46,9 +51,9 @@ def response_preparer(inner_messages):
messages = [
{
"role": "user",
"content": """Earlier you were asked the following:
"content": f"""Earlier you were asked the following:
__PROMPT__
{PROMPT}
Your team then worked diligently to address that request. Here is a transcript of that conversation:""",
}
@@ -69,10 +74,10 @@ Your team then worked diligently to address that request. Here is a transcript o
messages.append(
{
"role": "user",
"content": """
"content": f"""
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
__PROMPT__
{PROMPT}
To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
@@ -140,7 +145,7 @@ if len(filename_prompt) > 0:
question = f"""
Below I will pose a question to you that I would like you to answer. You should begin by listing all the relevant facts necessary to derive an answer, then fill in those facts from memory where possible, including specific names, numbers and statistics. You are Ken Jennings-level with trivia, and Mensa-level with puzzles, so there should be a deep well to draw from. After listing the facts, begin to solve the question in earnest. Here is the question:
{filename_prompt}__PROMPT__
{filename_prompt}{PROMPT}
""".strip()
groupchat = GroupChatModerator(

View File

@@ -8,6 +8,7 @@ import gzip
import io
import json
import os
import re
import base64
URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
@@ -16,7 +17,13 @@ SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
# A selected subset of HumanEval problems to work with during development
# Deprecated 2/5/2024 -- Use subsample instead
REDUCED_SET = [
"HumanEval/2",
"HumanEval/26",
@@ -73,19 +80,17 @@ def create_jsonl(name, tasks, template):
"""Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path."""
# Create a task directory if it doesn't exist
scenario_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
task_dir = os.path.join(scenario_dir, "Tasks")
if not os.path.isdir(task_dir):
os.mkdir(task_dir)
if not os.path.isdir(TASKS_DIR):
os.mkdir(TASKS_DIR)
# Create the jsonl file
with open(os.path.join(task_dir, name + ".jsonl"), "wt") as fh:
with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh:
for task in tasks:
print(f"Converting: [{name}] {task['task_id']}")
record = {
"id": task["task_id"].replace("/", "_"),
"template": os.path.join(os.path.pardir, template),
"template": template,
"substitutions": {
"scenario.py": {
"__ENTRY_POINT__": task["entry_point"],
@@ -102,19 +107,19 @@ def create_jsonl(name, tasks, template):
###############################################################################
def main():
human_eval = download_human_eval()
reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
# Deprecated: reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
templates = {
"two_agents": "Templates/TwoAgents",
# "gc3_distractor": "Templates/GroupChatThreeAgents_Distractor",
# "gc3_guardrails": "Templates/GroupChatThreeAgents_Guardrails",
# "gc4": "Templates/GroupChatFourAgents",
}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Create the various combinations of [models] x [templates]
for t in templates.items():
create_jsonl(f"human_eval_{t[0]}", human_eval, t[1])
create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
# Deprecated: create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
if __name__ == "__main__" and __package__ is None:

View File

@@ -8,6 +8,7 @@ import tarfile
import io
import json
import os
import re
import sys
URL = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
@@ -91,7 +92,7 @@ def create_jsonl(name, problems, template):
record = {
"id": task_id,
"template": os.path.join(os.path.pardir, template),
"template": template,
"substitutions": {
"prompt.txt": {"__PROMPT__": data["problem"]},
"expected_answer.txt": {"__ANSWER__": data["solution"]},
@@ -105,9 +106,12 @@ def create_jsonl(name, problems, template):
def main():
problems = download_math()
templates = {
"two_agents": "Templates/TwoAgents",
}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
for t in templates.items():
create_jsonl(f"math_{t[0]}", problems, t[1])