mirror of
https://github.com/microsoft/autogen.git
synced 2025-11-02 10:50:03 +00:00
Version 0.0.2 of Autogenbench (#1548)
* Prints the version of AutoGenBench from the command line, closing i1458 * Added autogenbench version to timestamp.txt * Attempting to fix formatting. * Add a gitignore for autogenbench * Generalize to read all template dirs from Templates * AutoGenBench logs telemetry when available. * Remove spaces if present from template names. * Bump version. * Fixed formatting. * Allow native warning to be skipped. Mount autogen repo in Docker if it can be found (experimental). * Native execution now occurs in a venv. * Bump version. * Fixed a prompt escaping bug evident in GAIA task '6f37996b-2ac7-44b0-8e68-6d28256631b4' * Updated all scenarios to use template discovery. * Update with main version of runtime_logging. --------- Co-authored-by: gagb <gagb@users.noreply.github.com>
This commit is contained in:
parent
477598afff
commit
085bf6cf3d
3
samples/tools/autogenbench/.gitignore
vendored
Normal file
3
samples/tools/autogenbench/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
scenarios/*/Downloads
|
||||
scenarios/*/Tasks
|
||||
*/Results
|
||||
@ -1,4 +1,5 @@
|
||||
import sys
|
||||
from .version import __version__
|
||||
from .run_cmd import run_cli
|
||||
from .clone_cmd import clone_cli
|
||||
from .tabulate_cmd import tabulate_cli
|
||||
@ -9,6 +10,7 @@ def main(args=None):
|
||||
args = sys.argv[:] # Shallow copy
|
||||
|
||||
invocation_cmd = "autogenbench"
|
||||
version_string = f"AutoGenBench version {__version__}"
|
||||
|
||||
commands = [
|
||||
{
|
||||
@ -26,6 +28,11 @@ def main(args=None):
|
||||
"description": "tabulate the results of a previous run",
|
||||
"function": tabulate_cli,
|
||||
},
|
||||
{
|
||||
"command": "--version",
|
||||
"description": f"print the version of {invocation_cmd}",
|
||||
"function": lambda _args: print(f"{version_string}"),
|
||||
},
|
||||
{"command": "--help", "description": "print this message", "function": None},
|
||||
]
|
||||
|
||||
@ -40,6 +47,8 @@ def main(args=None):
|
||||
commands_details += f" {padded_cmd}: {c['description']}\n"
|
||||
|
||||
usage_text = f"""
|
||||
{version_string}
|
||||
|
||||
usage: {invocation_cmd} COMMAND ARGS
|
||||
|
||||
Where, COMMAND is one of: {commands_list}
|
||||
@ -49,6 +58,8 @@ and ARGS are specific to the command.
|
||||
""".strip()
|
||||
|
||||
help_text = f"""
|
||||
{version_string}
|
||||
|
||||
usage: {invocation_cmd} COMMAND ARGS
|
||||
|
||||
{invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typically session might resemble:
|
||||
|
||||
@ -11,6 +11,7 @@ import docker
|
||||
import random
|
||||
from autogen import config_list_from_json
|
||||
from autogen.oai.openai_utils import filter_config
|
||||
from .version import __version__
|
||||
|
||||
# Figure out where everything is
|
||||
SCRIPT_PATH = os.path.realpath(__file__)
|
||||
@ -247,17 +248,25 @@ def get_scenario_env(config_list, env_file=DEFAULT_ENV_FILE):
|
||||
Returns: A dictionary of keys and values that need to be added to the system environment.
|
||||
"""
|
||||
env = dict()
|
||||
if os.path.isfile(env_file):
|
||||
with open(env_file, "rt") as fh:
|
||||
env = json.loads(fh.read())
|
||||
|
||||
config_list_json = json.dumps(config_list)
|
||||
env["OAI_CONFIG_LIST"] = config_list_json
|
||||
|
||||
# Populate with commonly needed keys
|
||||
openai_api_key = os.environ.get("OPENAI_API_KEY")
|
||||
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
|
||||
env["OPENAI_API_KEY"] = openai_api_key
|
||||
|
||||
bing_api_key = os.environ.get("BING_API_KEY")
|
||||
if bing_api_key is not None and len(bing_api_key.strip()) > 0:
|
||||
env["BING_API_KEY"] = bing_api_key
|
||||
|
||||
# Update with any values from the ENV.json file
|
||||
if os.path.isfile(env_file):
|
||||
with open(env_file, "rt") as fh:
|
||||
env.update(json.loads(fh.read()))
|
||||
|
||||
# Include the config_list that we are using
|
||||
config_list_json = json.dumps(config_list)
|
||||
env["OAI_CONFIG_LIST"] = config_list_json
|
||||
|
||||
return env
|
||||
|
||||
|
||||
@ -286,6 +295,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
|
||||
f"""#
|
||||
echo RUN.SH STARTING !#!#
|
||||
export AUTOGEN_TESTBED_SETTING="Native"
|
||||
echo "autogenbench version: {__version__}" > timestamp.txt
|
||||
|
||||
# Create and activate the virtual environment
|
||||
# This is called in a subprocess, and will not impact the parent
|
||||
{sys.executable} -m venv .autogenbench_venv
|
||||
. .autogenbench_venv/bin/activate
|
||||
|
||||
# Run the global init script if it exists
|
||||
if [ -f global_init.sh ] ; then
|
||||
@ -298,6 +313,7 @@ if [ -f scenario_init.sh ] ; then
|
||||
fi
|
||||
|
||||
# Run the scenario
|
||||
pip install -r requirements.txt
|
||||
echo SCENARIO.PY STARTING !#!#
|
||||
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
|
||||
EXIT_CODE=$?
|
||||
@ -312,6 +328,10 @@ if [ -d .cache ] ; then
|
||||
rm -Rf .cache
|
||||
fi
|
||||
|
||||
if [ -d __pycache__ ] ; then
|
||||
rm -Rf __pycache__
|
||||
fi
|
||||
|
||||
# Run the scenario finalize script if it exists
|
||||
if [ -f scenario_finalize.sh ] ; then
|
||||
. ./scenario_finalize.sh
|
||||
@ -322,6 +342,12 @@ if [ -f global_finalize.sh ] ; then
|
||||
. ./global_finalize.sh
|
||||
fi
|
||||
|
||||
# We don't need to deactivate the venv because it's
|
||||
# contained in the subprocess; but we should clean it up
|
||||
if [ -d .autogenbench_venv ] ; then
|
||||
rm -Rf .autogenbench_venv
|
||||
fi
|
||||
|
||||
echo RUN.SH COMPLETE !#!#
|
||||
"""
|
||||
)
|
||||
@ -387,7 +413,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
|
||||
f"""#
|
||||
echo RUN.SH STARTING !#!#
|
||||
export AUTOGEN_TESTBED_SETTING="Docker"
|
||||
|
||||
umask 000
|
||||
echo "autogenbench version: {__version__}" > timestamp.txt
|
||||
|
||||
# Run the global init script if it exists
|
||||
if [ -f global_init.sh ] ; then
|
||||
@ -415,6 +443,10 @@ if [ -d .cache ] ; then
|
||||
rm -Rf .cache
|
||||
fi
|
||||
|
||||
if [ -d __pycache__ ] ; then
|
||||
rm -Rf __pycache__
|
||||
fi
|
||||
|
||||
# Run the scenario finalize script if it exists
|
||||
if [ -f scenario_finalize.sh ] ; then
|
||||
. ./scenario_finalize.sh
|
||||
@ -429,18 +461,31 @@ echo RUN.SH COMPLETE !#!#
|
||||
"""
|
||||
)
|
||||
|
||||
print("\n\n" + work_dir + "\n===================================================================")
|
||||
# Figure out what folders to mount
|
||||
volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
|
||||
|
||||
# Add the autogen repo if we can find it
|
||||
autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
|
||||
if autogen_repo_base is None:
|
||||
autogen_repo_base = find_autogen_repo(os.getcwd())
|
||||
elif not os.path.isdir(autogen_repo_base):
|
||||
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
|
||||
|
||||
if autogen_repo_base is not None:
|
||||
volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen", "mode": "rw"}
|
||||
|
||||
print("Mounting:")
|
||||
for k in volumes:
|
||||
bind = volumes[k]["bind"]
|
||||
mode = volumes[k]["mode"].upper()
|
||||
if bind == "/workspace":
|
||||
k = os.path.relpath(k)
|
||||
print(f"[{mode}]\t'{k}' => '{bind}'")
|
||||
print("===================================================================")
|
||||
|
||||
# Create and run the container
|
||||
abs_path = str(pathlib.Path(work_dir).absolute())
|
||||
container = client.containers.run(
|
||||
image,
|
||||
command=["sh", "run.sh"],
|
||||
working_dir="/workspace",
|
||||
environment=env,
|
||||
detach=True,
|
||||
# get absolute path to the working directory
|
||||
volumes={abs_path: {"bind": "/workspace", "mode": "rw"}},
|
||||
image, command=["sh", "run.sh"], working_dir="/workspace", environment=env, detach=True, volumes=volumes
|
||||
)
|
||||
|
||||
# Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
|
||||
@ -485,6 +530,34 @@ def build_default_docker_image(docker_client, image_tag):
|
||||
sys.stdout.write(segment["stream"])
|
||||
|
||||
|
||||
def find_autogen_repo(path):
    """
    Search `path` and each of its ancestor directories for the root of an autogen checkout.

    A directory is treated as the repo root when it contains the file
    autogen/agentchat/conversable_agent.py. If `path` names an existing file,
    the search starts from the directory that contains it.

    Returns: the absolute path of the repo root if one is found, otherwise None
    """
    # Relative location of the file whose presence identifies the repo root
    marker = os.path.join("autogen", "agentchat", "conversable_agent.py")

    # Always start from a directory
    current = os.path.abspath(path)
    if os.path.isfile(current):
        current = os.path.dirname(current)

    # Climb one level at a time. At the filesystem root the parent of a
    # directory is the directory itself, which is what ends the loop.
    previous = None
    while current != previous:
        if os.path.isfile(os.path.join(current, marker)):
            return current
        previous = current
        current = os.path.abspath(os.path.join(current, os.pardir))

    return None
|
||||
|
||||
|
||||
def run_cli(args):
|
||||
invocation_cmd = args[0]
|
||||
args = args[1:]
|
||||
@ -581,12 +654,23 @@ def run_cli(args):
|
||||
if parsed_args.requirements is not None:
|
||||
sys.exit("--requirements is not compatible with --native. Exiting.")
|
||||
|
||||
choice = input(
|
||||
'WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\nAre you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
|
||||
sys.stderr.write(
|
||||
"WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
|
||||
)
|
||||
|
||||
if choice.strip().lower() != "yes":
|
||||
sys.exit("Received '" + choice + "'. Exiting.")
|
||||
# Does an environment variable override the prompt?
|
||||
allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
|
||||
if allow_native is None or allow_native == "":
|
||||
choice = input(
|
||||
'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
|
||||
)
|
||||
if choice.strip().lower() != "yes":
|
||||
sys.exit("Received '" + choice + "'. Exiting.")
|
||||
elif allow_native.strip().lower() != "yes":
|
||||
sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
|
||||
else:
|
||||
sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
|
||||
time.sleep(0.75) # Pause very briefly so the message isn't lost in the noise
|
||||
|
||||
# Parse the subsample
|
||||
subsample = None
|
||||
|
||||
@ -6,6 +6,15 @@ import json
|
||||
|
||||
AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)
|
||||
|
||||
# Try importing the runtime_logging module (only available in some branches)
|
||||
LOGGING_ENABLED = False
|
||||
try:
|
||||
import autogen.runtime_logging
|
||||
|
||||
LOGGING_ENABLED = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
def default_llm_config(config_list, timeout=180):
|
||||
"""Return a default config list with a given timeout, and with caching disabled.
|
||||
@ -57,6 +66,10 @@ def init():
|
||||
if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
|
||||
autogen.Completion.start_logging(compact=False)
|
||||
|
||||
# Start logging
|
||||
if LOGGING_ENABLED:
|
||||
autogen.runtime_logging.start(config={"dbname": "telemetry.db"})
|
||||
|
||||
|
||||
def finalize(agents):
|
||||
"""Helper function to finalize logging in a testbed scenario.
|
||||
@ -89,3 +102,7 @@ def finalize(agents):
|
||||
with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh:
|
||||
fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
|
||||
autogen.Completion.stop_logging()
|
||||
|
||||
# Stop logging
|
||||
if LOGGING_ENABLED:
|
||||
autogen.runtime_logging.stop()
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.0.1"
|
||||
__version__ = "0.0.2a4"
|
||||
|
||||
@ -47,3 +47,8 @@ exclude = ["*.tests*"]
|
||||
|
||||
[project.scripts]
|
||||
autogenbench = "autogenbench.cli:main"
|
||||
|
||||
[tool.black]
|
||||
# https://github.com/psf/black
|
||||
line-length = 120
|
||||
exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"
|
||||
|
||||
@ -8,6 +8,7 @@ import os
|
||||
import sys
|
||||
import glob
|
||||
import base64
|
||||
import re
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
SCRIPT_PATH = os.path.realpath(__file__)
|
||||
@ -88,7 +89,12 @@ def create_jsonl(name, template):
|
||||
|
||||
###############################################################################
|
||||
def main():
|
||||
templates = {"two_agents": os.path.join(TEMPLATES_DIR, "TwoAgents")}
|
||||
# list all directories in the Templates directory
|
||||
# and populate a dictionary with the name and path
|
||||
templates = {}
|
||||
for entry in os.scandir(TEMPLATES_DIR):
|
||||
if entry.is_dir():
|
||||
templates[re.sub(r"\s", "", entry.name)] = entry.path
|
||||
|
||||
# Add coding directories if needed (these are usually empty and left out of the repo)
|
||||
for template in templates.values():
|
||||
|
||||
@ -4,9 +4,11 @@
|
||||
"Scripts/init_tasks.py": "Scripts/init_tasks.py",
|
||||
"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",
|
||||
"Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",
|
||||
"Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",
|
||||
"Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",
|
||||
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
|
||||
"Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",
|
||||
"Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",
|
||||
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
|
||||
"Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"
|
||||
}
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
SCRIPT_PATH = os.path.realpath(__file__)
|
||||
@ -60,9 +61,9 @@ def create_jsonl(name, tasks, files_dir, template):
|
||||
"substitutions": {
|
||||
"scenario.py": {
|
||||
"__FILE_NAME__": task["file_name"],
|
||||
"__PROMPT__": task["Question"],
|
||||
},
|
||||
"expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
|
||||
"prompt.txt": {"__PROMPT__": task["Question"]},
|
||||
},
|
||||
}
|
||||
|
||||
@ -97,10 +98,12 @@ def main():
|
||||
|
||||
gaia_test_tasks[data["Level"] - 1].append(data)
|
||||
|
||||
templates = {
|
||||
"two_agents": os.path.join(TEMPLATES_DIR, "BasicTwoAgents"),
|
||||
"soc": os.path.join(TEMPLATES_DIR, "SocietyOfMind"),
|
||||
}
|
||||
# list all directories in the Templates directory
|
||||
# and populate a dictionary with the name and path
|
||||
templates = {}
|
||||
for entry in os.scandir(TEMPLATES_DIR):
|
||||
if entry.is_dir():
|
||||
templates[re.sub(r"\s", "", entry.name)] = entry.path
|
||||
|
||||
# Add coding directories if needed (these are usually empty and left out of the repo)
|
||||
for template in templates.values():
|
||||
|
||||
@ -0,0 +1 @@
|
||||
__PROMPT__
|
||||
@ -7,6 +7,10 @@ import testbed_utils
|
||||
testbed_utils.init()
|
||||
##############################
|
||||
|
||||
# Read the prompt
|
||||
PROMPT = ""
|
||||
with open("prompt.txt", "rt") as fh:
|
||||
PROMPT = fh.read().strip()
|
||||
|
||||
GAIA_SYSTEM_MESSAGE = (
|
||||
"You are a helpful AI assistant, and today's date is "
|
||||
@ -48,9 +52,7 @@ user_proxy = autogen.UserProxyAgent(
|
||||
)
|
||||
|
||||
filename = "__FILE_NAME__".strip()
|
||||
question = """
|
||||
__PROMPT__
|
||||
""".strip()
|
||||
question = PROMPT
|
||||
|
||||
if len(filename) > 0:
|
||||
question = f"Consider the file '{filename}', which can be read from the current working directory. If you need to read or write it, output python code in a code block (```python) to do so. {question}"
|
||||
|
||||
@ -0,0 +1 @@
|
||||
__PROMPT__
|
||||
@ -15,6 +15,11 @@ from autogen.token_count_utils import count_token, get_max_token_limit
|
||||
testbed_utils.init()
|
||||
##############################
|
||||
|
||||
# Read the prompt
|
||||
PROMPT = ""
|
||||
with open("prompt.txt", "rt") as fh:
|
||||
PROMPT = fh.read().strip()
|
||||
|
||||
config_list = autogen.config_list_from_json(
|
||||
"OAI_CONFIG_LIST",
|
||||
filter_dict={"model": ["gpt-4"]},
|
||||
@ -46,9 +51,9 @@ def response_preparer(inner_messages):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": """Earlier you were asked the following:
|
||||
"content": f"""Earlier you were asked the following:
|
||||
|
||||
__PROMPT__
|
||||
{PROMPT}
|
||||
|
||||
Your team then worked diligently to address that request. Here is a transcript of that conversation:""",
|
||||
}
|
||||
@ -69,10 +74,10 @@ Your team then worked diligently to address that request. Here is a transcript o
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": """
|
||||
"content": f"""
|
||||
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
|
||||
|
||||
__PROMPT__
|
||||
{PROMPT}
|
||||
|
||||
To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
|
||||
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
||||
@ -140,7 +145,7 @@ if len(filename_prompt) > 0:
|
||||
question = f"""
|
||||
Below I will pose a question to you that I would like you to answer. You should begin by listing all the relevant facts necessary to derive an answer, then fill in those facts from memory where possible, including specific names, numbers and statistics. You are Ken Jennings-level with trivia, and Mensa-level with puzzles, so there should be a deep well to draw from. After listing the facts, begin to solve the question in earnest. Here is the question:
|
||||
|
||||
{filename_prompt}__PROMPT__
|
||||
{filename_prompt}{PROMPT}
|
||||
""".strip()
|
||||
|
||||
groupchat = GroupChatModerator(
|
||||
|
||||
@ -8,6 +8,7 @@ import gzip
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import base64
|
||||
|
||||
URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
|
||||
@ -16,7 +17,13 @@ SCRIPT_PATH = os.path.realpath(__file__)
|
||||
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
|
||||
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
|
||||
|
||||
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
|
||||
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
|
||||
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
|
||||
|
||||
# A selected subset of HumanEval problems to work with during development
|
||||
|
||||
# Deprecated 2/5/2024 -- Use subsample instead
|
||||
REDUCED_SET = [
|
||||
"HumanEval/2",
|
||||
"HumanEval/26",
|
||||
@ -73,19 +80,17 @@ def create_jsonl(name, tasks, template):
|
||||
"""Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path."""
|
||||
|
||||
# Create a task directory if it doesn't exist
|
||||
scenario_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
|
||||
task_dir = os.path.join(scenario_dir, "Tasks")
|
||||
if not os.path.isdir(task_dir):
|
||||
os.mkdir(task_dir)
|
||||
if not os.path.isdir(TASKS_DIR):
|
||||
os.mkdir(TASKS_DIR)
|
||||
|
||||
# Create the jsonl file
|
||||
with open(os.path.join(task_dir, name + ".jsonl"), "wt") as fh:
|
||||
with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh:
|
||||
for task in tasks:
|
||||
print(f"Converting: [{name}] {task['task_id']}")
|
||||
|
||||
record = {
|
||||
"id": task["task_id"].replace("/", "_"),
|
||||
"template": os.path.join(os.path.pardir, template),
|
||||
"template": template,
|
||||
"substitutions": {
|
||||
"scenario.py": {
|
||||
"__ENTRY_POINT__": task["entry_point"],
|
||||
@ -102,19 +107,19 @@ def create_jsonl(name, tasks, template):
|
||||
###############################################################################
|
||||
def main():
|
||||
human_eval = download_human_eval()
|
||||
reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
|
||||
# Deprecated: reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
|
||||
|
||||
templates = {
|
||||
"two_agents": "Templates/TwoAgents",
|
||||
# "gc3_distractor": "Templates/GroupChatThreeAgents_Distractor",
|
||||
# "gc3_guardrails": "Templates/GroupChatThreeAgents_Guardrails",
|
||||
# "gc4": "Templates/GroupChatFourAgents",
|
||||
}
|
||||
# list all directories in the Templates directory
|
||||
# and populate a dictionary with the name and path
|
||||
templates = {}
|
||||
for entry in os.scandir(TEMPLATES_DIR):
|
||||
if entry.is_dir():
|
||||
templates[re.sub(r"\s", "", entry.name)] = entry.path
|
||||
|
||||
# Create the various combinations of [models] x [templates]
|
||||
for t in templates.items():
|
||||
create_jsonl(f"human_eval_{t[0]}", human_eval, t[1])
|
||||
create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
|
||||
# Deprecated: create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
|
||||
|
||||
|
||||
if __name__ == "__main__" and __package__ is None:
|
||||
|
||||
@ -8,6 +8,7 @@ import tarfile
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
URL = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
|
||||
@ -91,7 +92,7 @@ def create_jsonl(name, problems, template):
|
||||
|
||||
record = {
|
||||
"id": task_id,
|
||||
"template": os.path.join(os.path.pardir, template),
|
||||
"template": template,
|
||||
"substitutions": {
|
||||
"prompt.txt": {"__PROMPT__": data["problem"]},
|
||||
"expected_answer.txt": {"__ANSWER__": data["solution"]},
|
||||
@ -105,9 +106,12 @@ def create_jsonl(name, problems, template):
|
||||
def main():
|
||||
problems = download_math()
|
||||
|
||||
templates = {
|
||||
"two_agents": "Templates/TwoAgents",
|
||||
}
|
||||
# list all directories in the Templates directory
|
||||
# and populate a dictionary with the name and path
|
||||
templates = {}
|
||||
for entry in os.scandir(TEMPLATES_DIR):
|
||||
if entry.is_dir():
|
||||
templates[re.sub(r"\s", "", entry.name)] = entry.path
|
||||
|
||||
for t in templates.items():
|
||||
create_jsonl(f"math_{t[0]}", problems, t[1])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user