Version 0.0.2 of AutoGenBench (#1548)

* Prints the version of AutoGenBench from the command line, closing #1458 (see the example after this list)

* Added autogenbench version to timestamp.txt

* Attempting to fix formatting.

* Add a gitignore for autogenbench

* Generalize to read all template dirs from Templates

* AutoGenBench logs telemetry when available.

* Remove spaces if present from template names.

* Bump version.

* Fixed formatting.

* Allow native warning to be skipped. Mount autogen repo in Docker if it can be found (experimental).

* Native execution now occurs in a venv.

* Bump version.

* Fixed a prompt escaping bug evident in GAIA task '6f37996b-2ac7-44b0-8e68-6d28256631b4'

* Updated all scenarios to use template discovery.

* Update with main version of runtime_logging.
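As a quick check of the version flag described above, it can be exercised from Python; a minimal sketch (assumes autogenbench is installed on PATH, and the exact string depends on the installed build):

    import subprocess

    # Prints e.g. "AutoGenBench version 0.0.2a4" for this release
    result = subprocess.run(["autogenbench", "--version"], capture_output=True, text=True)
    print(result.stdout.strip())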

---------

Co-authored-by: gagb <gagb@users.noreply.github.com>
afourney 2024-02-24 10:12:57 -08:00 committed by GitHub
parent 477598afff
commit 085bf6cf3d
15 changed files with 202 additions and 53 deletions

samples/tools/autogenbench/.gitignore (new, vendored)
View File

@@ -0,0 +1,3 @@
scenarios/*/Downloads
scenarios/*/Tasks
*/Results

View File

@@ -1,4 +1,5 @@
import sys
from .version import __version__
from .run_cmd import run_cli
from .clone_cmd import clone_cli
from .tabulate_cmd import tabulate_cli
@@ -9,6 +10,7 @@ def main(args=None):
args = sys.argv[:] # Shallow copy
invocation_cmd = "autogenbench"
version_string = f"AutoGenBench version {__version__}"
commands = [
{
@@ -26,6 +28,11 @@
"description": "tabulate the results of a previous run",
"function": tabulate_cli,
},
{
"command": "--version",
"description": f"print the version of {invocation_cmd}",
"function": lambda _args: print(f"{version_string}"),
},
{"command": "--help", "description": "print this message", "function": None},
]
@@ -40,6 +47,8 @@
commands_details += f" {padded_cmd}: {c['description']}\n"
usage_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
Where, COMMAND is one of: {commands_list}
@@ -49,6 +58,8 @@ and ARGS are specific to the command.
""".strip()
help_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
{invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typical session might resemble:

View File

@@ -11,6 +11,7 @@ import docker
import random
from autogen import config_list_from_json
from autogen.oai.openai_utils import filter_config
from .version import __version__
# Figure out where everything is
SCRIPT_PATH = os.path.realpath(__file__)
@@ -247,17 +248,25 @@ def get_scenario_env(config_list, env_file=DEFAULT_ENV_FILE):
Returns: A dictionary of keys and values that need to be added to the system environment.
"""
env = dict()
if os.path.isfile(env_file):
with open(env_file, "rt") as fh:
env = json.loads(fh.read())
config_list_json = json.dumps(config_list)
env["OAI_CONFIG_LIST"] = config_list_json
# Populate with commonly needed keys
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
env["OPENAI_API_KEY"] = openai_api_key
bing_api_key = os.environ.get("BING_API_KEY")
if bing_api_key is not None and len(bing_api_key.strip()) > 0:
env["BING_API_KEY"] = bing_api_key
# Update with any values from the ENV.json file
if os.path.isfile(env_file):
with open(env_file, "rt") as fh:
env.update(json.loads(fh.read()))
# Include the config_list that we are using
config_list_json = json.dumps(config_list)
env["OAI_CONFIG_LIST"] = config_list_json
return env
@@ -286,6 +295,12 @@ def run_scenario_natively(work_dir, env, timeout=TASK_TIMEOUT):
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Native"
echo "autogenbench version: {__version__}" > timestamp.txt
# Create and activate the virtual environment
# This is called in a subprocess, and will not impact the parent
{sys.executable} -m venv .autogenbench_venv
. .autogenbench_venv/bin/activate
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
@@ -298,6 +313,7 @@ if [ -f scenario_init.sh ] ; then
fi
# Run the scenario
pip install -r requirements.txt
echo SCENARIO.PY STARTING !#!#
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
EXIT_CODE=$?
@@ -312,6 +328,10 @@ if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
@@ -322,6 +342,12 @@ if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
# We don't need to deactivate the venv because it's
# contained in the subprocess; but we should clean it up
if [ -d .autogenbench_venv ] ; then
rm -Rf .autogenbench_venv
fi
echo RUN.SH COMPLETE !#!#
"""
)
@@ -387,7 +413,9 @@ def run_scenario_in_docker(work_dir, env, timeout=TASK_TIMEOUT, docker_image=Non
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Docker"
umask 000
echo "autogenbench version: {__version__}" > timestamp.txt
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
@@ -415,6 +443,10 @@ if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
@@ -429,18 +461,31 @@ echo RUN.SH COMPLETE !#!#
"""
)
print("\n\n" + work_dir + "\n===================================================================")
# Figure out what folders to mount
volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
# Add the autogen repo if we can find it
autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
if autogen_repo_base is None:
autogen_repo_base = find_autogen_repo(os.getcwd())
elif not os.path.isdir(autogen_repo_base):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
if autogen_repo_base is not None:
volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/autogen", "mode": "rw"}
print("Mounting:")
for k in volumes:
bind = volumes[k]["bind"]
mode = volumes[k]["mode"].upper()
if bind == "/workspace":
k = os.path.relpath(k)
print(f"[{mode}]\t'{k}' => '{bind}'")
print("===================================================================")
# Create and run the container
abs_path = str(pathlib.Path(work_dir).absolute())
container = client.containers.run(
image,
command=["sh", "run.sh"],
working_dir="/workspace",
environment=env,
detach=True,
# get absolute path to the working directory
volumes={abs_path: {"bind": "/workspace", "mode": "rw"}},
image, command=["sh", "run.sh"], working_dir="/workspace", environment=env, detach=True, volumes=volumes
)
# Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
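The experimental mount can also be pinned to a specific checkout instead of relying on find_autogen_repo's upward search; a hedged sketch (the repo path and task file name are hypothetical; per the check above, an invalid path raises FileNotFoundError):

    import os
    import subprocess

    # Pin the autogen checkout that gets mounted at /autogen in the container
    env = dict(os.environ, AUTOGENBENCH_REPO_BASE="/home/user/autogen")
    subprocess.run(["autogenbench", "run", "Tasks/human_eval_TwoAgents.jsonl"], env=env)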
@@ -485,6 +530,34 @@ def build_default_docker_image(docker_client, image_tag):
sys.stdout.write(segment["stream"])
def find_autogen_repo(path):
"""
Utility for identifying if the path is a subdirectory of the autogen repo.
Returns: the path to the root of the autogen repo if one is found, otherwise None
"""
# Normalize the path (we expect a directory)
path = os.path.abspath(path)
if os.path.isfile(path):
path = os.path.dirname(path)
while True:
test_path = os.path.join(path, "autogen", "agentchat", "conversable_agent.py") # We found autogen
if os.path.isfile(test_path):
return path
# Stop if we hit the root
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
if parent_dir == path:
break
# Keep searching
path = parent_dir
return None
def run_cli(args):
invocation_cmd = args[0]
args = args[1:]
@@ -581,12 +654,23 @@ def run_cli(args):
if parsed_args.requirements is not None:
sys.exit("--requirements is not compatible with --native. Exiting.")
choice = input(
'WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\nAre you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
sys.stderr.write(
"WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
)
if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
# Does an environment variable override the prompt?
allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
if allow_native is None or allow_native == "":
choice = input(
'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
)
if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
elif allow_native.strip().lower() != "yes":
sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
else:
sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
time.sleep(0.75) # Pause very briefly so the message isn't lost in the noise
# Parse the subsample
subsample = None
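Given the logic above, scripted or CI runs can pre-set the variable to skip the confirmation prompt entirely; a sketch (the task file name is hypothetical):

    import os
    import subprocess

    # "yes" (any casing, after trimming) suppresses the native-execution prompt
    env = dict(os.environ, AUTOGENBENCH_ALLOW_NATIVE="yes")
    subprocess.run(["autogenbench", "run", "--native", "Tasks/human_eval_TwoAgents.jsonl"], env=env)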

View File

@@ -6,6 +6,15 @@ import json
AUTOGEN_VERSION = packaging.version.parse(autogen.__version__)
# Try importing the runtime_logging module (only available in some branches)
LOGGING_ENABLED = False
try:
import autogen.runtime_logging
LOGGING_ENABLED = True
except ImportError:
pass
def default_llm_config(config_list, timeout=180):
"""Return a default config list with a given timeout, and with caching disabled.
@@ -57,6 +66,10 @@ def init():
if AUTOGEN_VERSION < packaging.version.parse("0.2.0b1"):
autogen.Completion.start_logging(compact=False)
# Start logging
if LOGGING_ENABLED:
autogen.runtime_logging.start(config={"dbname": "telemetry.db"})
def finalize(agents):
"""Helper function to finalize logging in a testbed scenario.
@@ -89,3 +102,7 @@ def finalize(agents):
with open(os.path.join(script_dir, "completion_log.json"), "wt") as fh:
fh.write(json.dumps(autogen.Completion.logged_history, indent=4))
autogen.Completion.stop_logging()
# Stop logging
if LOGGING_ENABLED:
autogen.runtime_logging.stop()
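When the runtime logger is present, telemetry lands in telemetry.db inside the scenario's working directory. A minimal sketch for inspecting it after a run (assumes the file is SQLite, as the .db name in the config above suggests; table names depend on the logger's schema):

    import sqlite3

    con = sqlite3.connect("telemetry.db")
    # List whatever tables the runtime logger created
    tables = con.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
    print(tables)
    con.close()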

View File

@@ -1 +1 @@
__version__ = "0.0.1"
__version__ = "0.0.2a4"

View File

@@ -47,3 +47,8 @@ exclude = ["*.tests*"]
[project.scripts]
autogenbench = "autogenbench.cli:main"
[tool.black]
# https://github.com/psf/black
line-length = 120
exclude = "(.eggs|.git|.hg|.mypy_cache|.venv|_build|buck-out|build|dist)"
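With this table in pyproject.toml, black picks the settings up automatically when invoked from the package root; a trivial sketch (assumes black is installed):

    import subprocess

    # Formats the package using the line-length and exclude rules declared above
    subprocess.run(["black", "."])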

View File

@@ -8,6 +8,7 @@ import os
import sys
import glob
import base64
import re
from huggingface_hub import snapshot_download
SCRIPT_PATH = os.path.realpath(__file__)
@@ -88,7 +89,12 @@ def create_jsonl(name, template):
###############################################################################
def main():
templates = {"two_agents": os.path.join(TEMPLATES_DIR, "TwoAgents")}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Add coding directories if needed (these are usually empty and left out of the repo)
for template in templates.values():
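Because discovered template names feed into generated task identifiers and file names, whitespace is stripped during discovery; a small sketch of that normalization (the directory name is hypothetical):

    import re

    # "Society Of Mind" -> "SocietyOfMind"
    print(re.sub(r"\s", "", "Society Of Mind"))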

View File

@@ -4,9 +4,11 @@
"Scripts/init_tasks.py": "Scripts/init_tasks.py",
"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py",
"Templates/BasicTwoAgents/expected_answer.txt": "Templates/BasicTwoAgents/expected_answer.txt",
"Templates/BasicTwoAgents/prompt.txt": "Templates/BasicTwoAgents/prompt.txt",
"Templates/BasicTwoAgents/scenario.py": "Templates/BasicTwoAgents/scenario.py",
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
"Templates/SocietyOfMind/expected_answer.txt": "Templates/SocietyOfMind/expected_answer.txt",
"Templates/SocietyOfMind/prompt.txt": "Templates/SocietyOfMind/prompt.txt",
"Templates/SocietyOfMind/scenario.py": "Templates/SocietyOfMind/scenario.py",
"Templates/SocietyOfMind/requirements.txt": "Templates/SocietyOfMind/requirements.txt"
}
}

View File

@@ -6,6 +6,7 @@
import json
import os
import sys
import re
from huggingface_hub import snapshot_download
SCRIPT_PATH = os.path.realpath(__file__)
@@ -60,9 +61,9 @@ def create_jsonl(name, tasks, files_dir, template):
"substitutions": {
"scenario.py": {
"__FILE_NAME__": task["file_name"],
"__PROMPT__": task["Question"],
},
"expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]},
"prompt.txt": {"__PROMPT__": task["Question"]},
},
}
@@ -97,10 +98,12 @@ def main():
gaia_test_tasks[data["Level"] - 1].append(data)
templates = {
"two_agents": os.path.join(TEMPLATES_DIR, "BasicTwoAgents"),
"soc": os.path.join(TEMPLATES_DIR, "SocietyOfMind"),
}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Add coding directories if needed (these are usually empty and left out of the repo)
for template in templates.values():

View File

@@ -7,6 +7,10 @@ import testbed_utils
testbed_utils.init()
##############################
# Read the prompt
PROMPT = ""
with open("prompt.txt", "rt") as fh:
PROMPT = fh.read().strip()
GAIA_SYSTEM_MESSAGE = (
"You are a helpful AI assistant, and today's date is "
@@ -48,9 +52,7 @@ user_proxy = autogen.UserProxyAgent(
)
filename = "__FILE_NAME__".strip()
question = """
__PROMPT__
""".strip()
question = PROMPT
if len(filename) > 0:
question = f"Consider the file '{filename}', which can be read from the current working directory. If you need to read or write it, output python code in a code block (```python) to do so. {question}"

View File

@@ -0,0 +1 @@
__PROMPT__
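Reading the prompt from the new prompt.txt template, rather than substituting __PROMPT__ directly into scenario.py as before, is what closes the escaping bug: prompt text containing quotes or backslashes can no longer produce invalid Python. A sketch of the failure mode and the fix (the offending prompt content is hypothetical):

    # Old approach: raw substitution into a triple-quoted literal could break parsing,
    # e.g. a prompt containing """ or ending in a backslash -> SyntaxError at run time.
    # New approach: the prompt is plain data read at run time, so its content is inert.
    with open("prompt.txt", "rt") as fh:
        PROMPT = fh.read().strip()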

View File

@@ -15,6 +15,11 @@ from autogen.token_count_utils import count_token, get_max_token_limit
testbed_utils.init()
##############################
# Read the prompt
PROMPT = ""
with open("prompt.txt", "rt") as fh:
PROMPT = fh.read().strip()
config_list = autogen.config_list_from_json(
"OAI_CONFIG_LIST",
filter_dict={"model": ["gpt-4"]},
@@ -46,9 +51,9 @@ def response_preparer(inner_messages):
messages = [
{
"role": "user",
"content": """Earlier you were asked the following:
"content": f"""Earlier you were asked the following:
__PROMPT__
{PROMPT}
Your team then worked diligently to address that request. Here is a transcript of that conversation:""",
}
@@ -69,10 +74,10 @@ Your team then worked diligently to address that request. Here is a transcript o
messages.append(
{
"role": "user",
"content": """
"content": f"""
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
__PROMPT__
{PROMPT}
To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
@@ -140,7 +145,7 @@ if len(filename_prompt) > 0:
question = f"""
Below I will pose a question to you that I would like you to answer. You should begin by listing all the relevant facts necessary to derive an answer, then fill in those facts from memory where possible, including specific names, numbers and statistics. You are Ken Jennings-level with trivia, and Mensa-level with puzzles, so there should be a deep well to draw from. After listing the facts, begin to solve the question in earnest. Here is the question:
{filename_prompt}__PROMPT__
{filename_prompt}{PROMPT}
""".strip()
groupchat = GroupChatModerator(

View File

@@ -8,6 +8,7 @@ import gzip
import io
import json
import os
import re
import base64
URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
@@ -16,7 +17,13 @@ SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
# A selected subset of HumanEval problems to work with during development
# Deprecated 2/5/2024 -- Use subsample instead
REDUCED_SET = [
"HumanEval/2",
"HumanEval/26",
@@ -73,19 +80,17 @@ def create_jsonl(name, tasks, template):
"""Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path."""
# Create a task directory if it doesn't exist
scenario_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
task_dir = os.path.join(scenario_dir, "Tasks")
if not os.path.isdir(task_dir):
os.mkdir(task_dir)
if not os.path.isdir(TASKS_DIR):
os.mkdir(TASKS_DIR)
# Create the jsonl file
with open(os.path.join(task_dir, name + ".jsonl"), "wt") as fh:
with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh:
for task in tasks:
print(f"Converting: [{name}] {task['task_id']}")
record = {
"id": task["task_id"].replace("/", "_"),
"template": os.path.join(os.path.pardir, template),
"template": template,
"substitutions": {
"scenario.py": {
"__ENTRY_POINT__": task["entry_point"],
@@ -102,19 +107,19 @@ def create_jsonl(name, tasks, template):
###############################################################################
def main():
human_eval = download_human_eval()
reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
# Deprecated: reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
templates = {
"two_agents": "Templates/TwoAgents",
# "gc3_distractor": "Templates/GroupChatThreeAgents_Distractor",
# "gc3_guardrails": "Templates/GroupChatThreeAgents_Guardrails",
# "gc4": "Templates/GroupChatFourAgents",
}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Create the various combinations of [models] x [templates]
for t in templates.items():
create_jsonl(f"human_eval_{t[0]}", human_eval, t[1])
create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
# Deprecated: create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
if __name__ == "__main__" and __package__ is None:

View File

@@ -8,6 +8,7 @@ import tarfile
import io
import json
import os
import re
import sys
URL = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
@@ -91,7 +92,7 @@ def create_jsonl(name, problems, template):
record = {
"id": task_id,
"template": os.path.join(os.path.pardir, template),
"template": template,
"substitutions": {
"prompt.txt": {"__PROMPT__": data["problem"]},
"expected_answer.txt": {"__ANSWER__": data["solution"]},
@@ -105,9 +106,12 @@ def create_jsonl(name, problems, template):
def main():
problems = download_math()
templates = {
"two_agents": "Templates/TwoAgents",
}
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
for t in templates.items():
create_jsonl(f"math_{t[0]}", problems, t[1])