Initial work porting AutoGenBench to AGNext (#141)

* Initial check-in of agbench

* Moved agbench to intended directory.

* Removed pyautogen dependency

* moved to using process_until_idle

* Added TeamOne template.

* Use TeamOne agent classes.

* migrate to hatch, move benchmarks out, add CI (#166)

* Resolve type issues (#168)

Thanks for fixing types.

* Fixed import.

---------

Co-authored-by: Jack Gerrits <jackgerrits@users.noreply.github.com>
afourney 2024-07-02 10:58:49 -07:00 committed by GitHub
parent 766635394a
commit ca42b560e9
30 changed files with 2114 additions and 4 deletions

View File

@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch
@ -25,7 +25,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch
@ -37,7 +37,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch
@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch

View File

@ -0,0 +1,21 @@
# HumanEval Benchmark
This scenario implements a modified version of the [HumanEval](https://arxiv.org/abs/2107.03374) benchmark.
Compared to the original benchmark, there are **two key differences** here:
- A chat model rather than a completion model is used.
- The agents get pass/fail feedback about their implementations, and can keep trying until they succeed or run out of tokens or turns.
## Running the tasks
```
autogenbench run Tasks/human_eval_two_agents.jsonl
autogenbench tabulate Results/human_eval_two_agents
```
For faster development and iteration, a reduced HumanEval set is available via `Tasks/r_human_eval_two_agents.jsonl`; it contains only 26 problems of varying difficulty.
## References
**Evaluating Large Language Models Trained on Code**<br/>
Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, Wojciech Zaremba<br/>
[https://arxiv.org/abs/2107.03374](https://arxiv.org/abs/2107.03374)

View File

@ -0,0 +1,12 @@
import os
import sys
from agbench.tabulate_cmd import default_tabulate
def main(args):
default_tabulate(args)
if __name__ == "__main__" and __package__ is None:
main(sys.argv)

View File

@ -0,0 +1,124 @@
#
# Run this file to download the human_eval dataset, and create a corresponding testbed scenario:
# (default: ./Tasks/human_eval_[template].jsonl, one file per template)
#
import base64
import gzip
import io
import json
import os
import re
import requests
URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
# A selected subset of HumanEval problems to work with during development
# Deprecated 2/5/2024 -- Use subsample instead
REDUCED_SET = [
"HumanEval/2",
"HumanEval/26",
"HumanEval/32",
"HumanEval/33",
"HumanEval/36",
"HumanEval/38",
"HumanEval/41",
"HumanEval/50",
"HumanEval/56",
"HumanEval/65",
"HumanEval/67",
"HumanEval/84",
"HumanEval/85",
"HumanEval/86",
"HumanEval/89",
"HumanEval/99",
"HumanEval/104",
"HumanEval/113",
"HumanEval/115",
"HumanEval/120",
"HumanEval/124",
"HumanEval/126",
"HumanEval/132",
"HumanEval/135",
"HumanEval/140",
"HumanEval/146",
]
def download_human_eval():
"""Download the HumanEval dataset, un-gzips it, and returns a list of its parsed JSON objects."""
# Send an HTTP request to the URL of the file
response = requests.get(URL)
# Ensure we raise an error if the download failed
response.raise_for_status()
# Create a BytesIO object from the response content
buffer = io.BytesIO(response.content)
# Read the file, line by line, populating a list of parsed JSON objects
results = []
with gzip.GzipFile(fileobj=buffer) as f_in:
for line in f_in:
# Parse each line as JSON
results.append(json.loads(line))
return results
def create_jsonl(name, tasks, template):
"""Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path."""
# Create a task directory if it doesn't exist
if not os.path.isdir(TASKS_DIR):
os.mkdir(TASKS_DIR)
# Create the jsonl file
with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh:
for task in tasks:
print(f"Converting: [{name}] {task['task_id']}")
record = {
"id": task["task_id"].replace("/", "_"),
"template": template,
"substitutions": {
"scenario.py": {"__ENTRY_POINT__": task["entry_point"]},
"prompt.txt": {"__PROMPT__": task["prompt"]},
"unit_tests.py": {"__TEST__": task["test"]},
},
}
fh.write(json.dumps(record).strip() + "\n")
###############################################################################
def main():
human_eval = download_human_eval()
# Deprecated: reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Create one scenario JSONL file per template
for t in templates.items():
create_jsonl(f"human_eval_{t[0]}", human_eval, t[1])
# Deprecated: create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
if __name__ == "__main__" and __package__ is None:
main()

View File

@ -0,0 +1 @@
__PROMPT__

View File

@ -0,0 +1 @@
/agnext/teams/team-one

View File

@ -0,0 +1,79 @@
import asyncio
#from typing import Any, Dict, List, Tuple, Union
from agnext.application import SingleThreadedAgentRuntime
from agnext.components.models import (
AzureOpenAIChatCompletionClient,
LLMMessage,
ModelCapabilities,
UserMessage,
)
from agnext.components.code_executor import LocalCommandLineCodeExecutor
from team_one.agents.coder import Coder, Executor
from team_one.agents.orchestrator import RoundRobinOrchestrator
from team_one.messages import BroadcastMessage
async def main() -> None:
# Create the runtime.
runtime = SingleThreadedAgentRuntime()
# Create the AzureOpenAI client, with AAD auth
#token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
client = AzureOpenAIChatCompletionClient(
api_version="2024-02-15-preview",
azure_endpoint="https://aif-complex-tasks-west-us-3.openai.azure.com/",
model="gpt-4o-2024-05-13",
model_capabilities=ModelCapabilities(function_calling=True, json_output=True, vision=True),
#azure_ad_token_provider=token_provider
)
# Register agents.
coder = runtime.register_and_get_proxy(
"Coder",
lambda: Coder(model_client=client),
)
executor = runtime.register_and_get_proxy(
"Executor",
lambda: Executor("A agent for executing code", executor=LocalCommandLineCodeExecutor())
)
runtime.register("orchestrator", lambda: RoundRobinOrchestrator([coder, executor]))
prompt = ""
with open("prompt.txt", "rt") as fh:
prompt = fh.read()
entry_point = "__ENTRY_POINT__"
task = f"""
The following python code imports the `run_tests` function from unit_tests.py, and runs
it on the function `{entry_point}`. This will run a set of automated unit tests to verify the
correct implementation of `{entry_point}`. However, `{entry_point}` is only partially
implemented in the code below. Complete the implementation of `{entry_point}` and then execute
a new stand-alone code block that contains everything needed to run the tests, including: importing
`unit_tests`, calling `run_tests({entry_point})`, as well as {entry_point}'s complete definition,
such that this code block can be run directly in Python.
```python
from unit_tests import run_tests
{prompt}
# Run the unit tests
run_tests({entry_point})
```
""".strip()
await runtime.publish_message(BroadcastMessage(content=UserMessage(content=task, source="human")), namespace="default")
# Run the runtime until the task is completed.
await runtime.process_until_idle()
if __name__ == "__main__":
import logging
logging.basicConfig(level=logging.WARNING)
logging.getLogger("agnext").setLevel(logging.DEBUG)
asyncio.run(main())

View File

@ -0,0 +1,15 @@
# Disable ruff linter for template files
# ruff: noqa: F821 E722
import sys
__TEST__
def run_tests(candidate):
try:
check(candidate)
# We can search for this string in the output
print("ALL TESTS PASSED !#!#")
except AssertionError:
sys.exit("SOME TESTS FAILED - TRY AGAIN !#!#")

View File

@ -0,0 +1 @@
__PROMPT__

View File

@ -0,0 +1 @@
/agnext

View File

@ -0,0 +1,213 @@
import asyncio
import json
import re
import uuid
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple, Union
from agnext.application import SingleThreadedAgentRuntime
from agnext.components import FunctionCall, TypeRoutedAgent, message_handler
from agnext.components.code_executor import CodeBlock, CodeExecutor, LocalCommandLineCodeExecutor
from agnext.components.models import (
AssistantMessage,
AzureOpenAIChatCompletionClient,
ChatCompletionClient,
FunctionExecutionResult,
FunctionExecutionResultMessage,
LLMMessage,
ModelCapabilities,
OpenAIChatCompletionClient,
SystemMessage,
UserMessage,
)
from agnext.components.tools import CodeExecutionResult, PythonCodeExecutionTool
from agnext.core import AgentId, CancellationToken
#from azure.identity import DefaultAzureCredential, get_bearer_token_provider
@dataclass
class TaskMessage:
content: str
@dataclass
class CodeExecutionRequestMessage:
session_id: str
execution_request: str
@dataclass
class CodeExecutionResultMessage:
session_id: str
output: str
exit_code: int
class Coder(TypeRoutedAgent):
"""An agent that uses tools to write, execute, and debug Python code."""
DEFAULT_DESCRIPTION = "A Python coder assistant."
DEFAULT_SYSTEM_MESSAGES = [
SystemMessage("""You are a helpful AI assistant. Solve tasks using your Python coding skills. The code you output must be formatted in Markdown code blocks demarcated by triple backticks (```). As an example:
```python
def main():
print("Hello world.")
if __name__ == "__main__":
main()
```
The user cannot provide any feedback or perform any other action beyond executing the code you suggest. In particular, the user can't modify your code, and can't copy and paste anything, and can't fill in missing values. Thus, do not suggest incomplete code which requires users to perform any of these actions.
Check the execution result returned by the user. If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes -- code blocks must stand alone and be ready to execute without modification. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, and think of a different approach to try.
If the code has executed successfully and the problem is solved, reply "TERMINATE".
""")
]
def __init__(
self,
model_client: ChatCompletionClient,
description: str = DEFAULT_DESCRIPTION,
system_messages: List[SystemMessage] = DEFAULT_SYSTEM_MESSAGES,
max_turns: int | None = None,
) -> None:
super().__init__(description)
self._model_client = model_client
self._system_messages = system_messages
self._session_memory: Dict[str, List[LLMMessage]] = {}
self._max_turns = max_turns
@message_handler
async def handle_user_message(
self, message: TaskMessage, cancellation_token: CancellationToken
) -> None:
"""Handle a user message, execute the model and tools, and returns the response."""
# Create a new session.
session_id = str(uuid.uuid4())
self._session_memory.setdefault(session_id, []).append(UserMessage(content=message.content, source="user"))
# Make an inference to the model.
response = await self._model_client.create(self._system_messages + self._session_memory[session_id])
assert isinstance(response.content, str)
self._session_memory[session_id].append(AssistantMessage(content=response.content, source=self.metadata["name"]))
await self.publish_message(CodeExecutionRequestMessage(execution_request=response.content, session_id=session_id), cancellation_token=cancellation_token)
@message_handler
async def handle_code_execution_result(self, message: CodeExecutionResultMessage, cancellation_token: CancellationToken) -> None:
execution_result = f"The script ran, then exited with Unix exit code: {message.exit_code}\nIts output was:\n{message.output}"
# Store the code execution output.
self._session_memory[message.session_id].append(UserMessage(content=execution_result, source="user"))
# Count the number of rounds so far
if self._max_turns is not None:
n_turns = sum(1 for message in self._session_memory[message.session_id] if isinstance(message, AssistantMessage))
if n_turns >= self._max_turns:
return
# Make an inference to the model.
response = await self._model_client.create(self._system_messages + self._session_memory[message.session_id])
assert isinstance(response.content, str)
self._session_memory[message.session_id].append(AssistantMessage(content=response.content, source=self.metadata["name"]))
if "TERMINATE" in response.content:
return
else:
await self.publish_message(CodeExecutionRequestMessage(execution_request=response.content, session_id=message.session_id), cancellation_token=cancellation_token)
class Executor(TypeRoutedAgent):
def __init__(self, description: str, executor: CodeExecutor) -> None:
super().__init__(description)
self._executor = executor
@message_handler
async def handle_code_execution(self, message: CodeExecutionRequestMessage, cancellation_token: CancellationToken) -> None:
# Extract code block from the message.
code = self._extract_execution_request(message.execution_request)
if code is not None:
execution_requests = [CodeBlock(code=code, language="python")]
future = asyncio.get_event_loop().run_in_executor(None, self._executor.execute_code_blocks, execution_requests)
cancellation_token.link_future(future)
result = await future
await self.publish_message(CodeExecutionResultMessage(output=result.output, exit_code=result.exit_code, session_id=message.session_id))
else:
await self.publish_message(CodeExecutionResultMessage(output="No code block detected. Please provide a markdown-encoded code block to execute.", exit_code=1, session_id=message.session_id))
def _extract_execution_request(self, markdown_text: str) -> Union[str, None]:
pattern = r"```(\w+)\n(.*?)\n```"
# Search for the pattern in the markdown text
match = re.search(pattern, markdown_text, re.DOTALL)
# Extract the language and code block if a match is found
if match:
return match.group(2)
return None
async def main() -> None:
# Create the runtime.
runtime = SingleThreadedAgentRuntime()
# Create the AzureOpenAI client, with AAD auth
#token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
client = AzureOpenAIChatCompletionClient(
api_version="2024-02-15-preview",
azure_endpoint="https://aif-complex-tasks-west-us-3.openai.azure.com/",
model="gpt-4o-2024-05-13",
model_capabilities=ModelCapabilities(function_calling=True, json_output=True, vision=True),
#azure_ad_token_provider=token_provider
)
# Register agents.
coder = runtime.register_and_get(
"Coder",
lambda: Coder(model_client=client),
)
runtime.register(
"Executor",
lambda: Executor("A agent for executing code", executor=LocalCommandLineCodeExecutor())
)
prompt = ""
with open("prompt.txt", "rt") as fh:
prompt = fh.read()
entry_point = "__ENTRY_POINT__"
task = TaskMessage(f"""
The following python code imports the `run_tests` function from unit_tests.py, and runs
it on the function `{entry_point}`. This will run a set of automated unit tests to verify the
correct implementation of `{entry_point}`. However, `{entry_point}` is only partially
implemented in the code below. Complete the implementation of `{entry_point}` and then execute
a new stand-alone code block that contains everything needed to run the tests, including: importing
`unit_tests`, calling `run_tests({entry_point})`, as well as {entry_point}'s complete definition,
such that this code block can be run directly in Python.
```python
from unit_tests import run_tests
{prompt}
# Run the unit tests
run_tests({entry_point})
```
""".strip())
# Send a task to the tool user.
await runtime.send_message(task, coder)
# Run the runtime until the task is completed.
await runtime.process_until_idle()
if __name__ == "__main__":
import logging
logging.basicConfig(level=logging.WARNING)
logging.getLogger("agnext").setLevel(logging.DEBUG)
asyncio.run(main())

View File

@ -0,0 +1,15 @@
# Disable ruff linter for template files
# ruff: noqa: F821 E722
import sys
__TEST__
def run_tests(candidate):
try:
check(candidate)
# We can search for this string in the output
print("ALL TESTS PASSED !#!#")
except AssertionError:
sys.exit("SOME TESTS FAILED - TRY AGAIN !#!#")

View File

@ -84,6 +84,24 @@ build = "sphinx-build docs/src docs/build"
serve = "sphinx-autobuild --watch src docs/src docs/build"
check = "sphinx-build --fail-on-warning docs/src docs/build"
# Benchmark environments
[tool.hatch.envs.bench-humaneval-teamone]
installer = "uv"
detached = true
dependencies = [
"agnext@{root:uri}",
"agbench@{root:uri}/tools/agbench",
"team-one@{root:uri}/teams/team-one",
]
[tool.hatch.envs.bench-humaneval-twoagents]
installer = "uv"
detached = true
dependencies = [
"agnext@{root:uri}",
"agbench@{root:uri}/tools/agbench",
]
[tool.ruff]
line-length = 120
fix = true

python/tools/agbench/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
scenarios/*/Downloads
scenarios/*/Tasks
*/Results

View File

@ -0,0 +1,188 @@
# Contributing to AutoGenBench
As part of the broader AutoGen project, AutoGenBench welcomes community contributions. Contributions are subject to AutoGen's [contribution guidelines](https://microsoft.github.io/autogen/docs/Contribute), as well as a few additional AutoGenBench-specific requirements outlined here. You may also wish to develop your own private benchmark scenarios and the guidance in this document will help with such efforts as well. Below you will find the general requirements, followed by a detailed technical description.
## General Contribution Requirements
We ask that all contributions to AutoGenBench adhere to the following:
- Follow AutoGen's broader [contribution guidelines](https://microsoft.github.io/autogen/docs/Contribute)
- All AutoGenBench benchmarks should live in a subfolder of `/samples/tools/autogenbench/scenarios` alongside `HumanEval`, `GAIA`, etc.
- Benchmark scenarios should include a detailed README.md, in the root of their folder, describing the benchmark and providing citations where warranted.
- Benchmark data (tasks, ground truth, etc.) should be downloaded from their original sources rather than hosted in the AutoGen repository (unless the benchmark is original, and the repository *is* the original source)
- You can use the `Scripts/init_tasks.py` file to automate this download.
- Basic scoring should be compatible with the `autogenbench tabulate` command (e.g., by outputting logs compatible with the default tabulation mechanism, or by providing a `Scripts/custom_tabulate.py` file)
- If you wish your benchmark to be compatible with the `autogenbench clone` command, include a `MANIFEST.json` file in the root of your folder.
These requirements are further detailed below, but if you simply copy the `HumanEval` folder, you will already be off to a great start.
## Implementing and Running Benchmark Tasks
At the core of any benchmark is a set of tasks. To implement tasks that are runnable by AutoGenBench, you must adhere to AutoGenBench's templating and scenario expansion algorithms, as outlined below.
### Task Definitions
All tasks are stored in JSONL files (in subdirectories under `./Tasks`). Each line of a tasks file is a JSON object with the following schema:
```
{
"id": string,
"template": dirname,
"substitutions" {
"filename1": {
"find_string1_1": replace_string1_1,
"find_string1_2": replace_string1_2,
...
"find_string1_M": replace_string1_N
}
"filename2": {
"find_string2_1": replace_string2_1,
"find_string2_2": replace_string2_2,
...
"find_string2_N": replace_string2_N
}
}
}
```
For example:
```
{
"id": "two_agent_stocks_gpt4",
"template": "default_two_agents",
"substitutions": {
"scenario.py": {
"__MODEL__": "gpt-4",
},
"prompt.txt": {
"__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD."
}
}
}
```
In this example, the string `__MODEL__` will be replaced in the file `scenario.py`, while the string `__PROMPT__` will be replaced in the `prompt.txt` file.
The `template` field can also take on a list value, but this usage is considered advanced and is not described here. See the `autogenbench/run_cmd.py` code, or the `GAIA` benchmark tasks files for additional information about this option.
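For reference, each line of a tasks file is an independent JSON object, which is how `run_cmd.py` later in this change consumes it. A minimal, hedged sketch of reading such a file (the path below is illustrative, and the field names follow the schema described above):
```python
import json

# Minimal sketch: read a tasks JSONL file one record at a time.
with open("Tasks/human_eval_two_agents.jsonl", "rt") as fh:
    for line in fh:
        task = json.loads(line)
        # Each record names a template and the per-file find/replace substitutions.
        print(task["id"], task["template"], list(task["substitutions"].keys()))
```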
## Task Instance Expansion Algorithm
Once the tasks have been defined, as per above, they must be "instantiated" before they can be run. This instantiation happens automatically when the user issues the `autogenbench run` command and involves creating a local folder to share with Docker. Each instance and repetition gets its own folder along the path: `./results/[scenario]/[task_id]/[instance_id]`. For the sake of brevity we will refer to this folder as the `DEST_FOLDER`.
The algorithm for populating the `DEST_FOLDER` is as follows:
1. Pre-populate DEST_FOLDER with all the basic starter files for running a scenario (found in `autogenbench/template`).
2. Recursively copy the template folder specified in the JSONL line to DEST_FOLDER (if the JSON `template` attribute points to a folder). If the JSON's `template` attribute instead points to a file, copy the file, but rename it to `scenario.py`.
3. Apply any string replacements, as outlined in the prior section.
4. Write a run.sh file to DEST_FOLDER that will be executed by Docker when it is loaded. The `run.sh` is described below.
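The authoritative implementation of this expansion is `expand_scenario` in `run_cmd.py` (included later in this change). The following is only a simplified sketch of steps 2 and 3: the function name `expand_task` is hypothetical, and it ignores list-valued templates, the starter-file pre-population of step 1, and the `run.sh` generation of step 4.
```python
import os
import shutil
from typing import Any, Dict


def expand_task(scenario_dir: str, task: Dict[str, Any], dest_folder: str) -> None:
    """Simplified sketch: copy the template (step 2), then apply substitutions (step 3)."""
    template_path = os.path.join(scenario_dir, task["template"])
    if os.path.isdir(template_path):
        # Folder-valued templates are copied recursively into DEST_FOLDER.
        shutil.copytree(template_path, dest_folder, dirs_exist_ok=True)
    else:
        # File-valued templates are copied and renamed to scenario.py.
        shutil.copyfile(template_path, os.path.join(dest_folder, "scenario.py"))

    # In-place find-and-replace, file by file, as described in the prior section.
    for rel_path, replacements in task["substitutions"].items():
        target = os.path.join(dest_folder, rel_path)
        with open(target, "rt") as fh:
            contents = fh.read()
        for find_str, replace_str in replacements.items():
            contents = contents.replace(find_str, replace_str)
        with open(target, "wt") as fh:
            fh.write(contents)
```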
## Scenario Execution Algorithm
Once the task has been instantiated it is run (via run.sh). This script will execute the following steps:
1. If a file named `global_init.sh` is present, run it.
2. If a file named `scenario_init.sh` is present, run it.
3. Install the requirements.txt file (if running in Docker)
4. Run the task via `python scenario.py`
5. If the scenario.py exited cleanly (exit code 0), then print "SCENARIO.PY COMPLETE !#!#"
6. Clean up (delete cache, etc.)
7. If a file named `scenario_finalize.sh` is present, run it.
8. If a file named `global_finalize.sh` is present, run it.
9. echo "RUN COMPLETE !#!#", signaling that all steps completed.
Notably, this means that scenarios can add custom init and teardown logic by including `scenario_init.sh` and `scenario_finalize.sh` files.
At the time of this writing, the run.sh file is as follows:
```sh
export AUTOGEN_TESTBED_SETTING="Docker"
umask 000
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
. ./global_init.sh
fi
# Run the scenario init script if it exists
if [ -f scenario_init.sh ] ; then
. ./scenario_init.sh
fi
# Run the scenario
pip install -r requirements.txt
python scenario.py
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
else
echo SCENARIO.PY COMPLETE !#!#
fi
# Clean up
if [ -d .cache ] ; then
rm -Rf .cache
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
fi
# Run the global finalize script if it exists
if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
echo RUN.SH COMPLETE !#!#
```
Be warned that this listing is provided here for illustration purposes, and may vary over time. The sources of truth are the `run.sh` files found in the ``./results/[taskset]/[task_id]/[instance_id]`` folders.
## Integrating with the `tabulate` and `clone` commands.
The above details are sufficient for defining and running tasks, but if you wish to support the `autogenbench tabulate` and `autogenbench clone` commands, a few additional steps are required.
### Tabulations
If you wish to leverage the default tabulation logic, it is as simple as arranging your `scenario.py` file to output the string "ALL TESTS PASSED !#!#" to the console in the event that a task was solved correctly.
If you wish to implement your own tabulation logic, simply create the file `Scripts/custom_tabulate.py` and include a `main(args)` method. Here, the `args` parameter will be provided by AutoGenBench, and is a drop-in replacement for `sys.argv`. In particular, `args[0]` will be the invocation command (similar to the executable or script name in `sys.argv`), and the remaining values (`args[1:]`) are the command line parameters.
Should you provide a custom tabulation script, please implement `--help` and `-h` options for documenting your interface.
The `scenarios/GAIA/Scripts/custom_tabulate.py` is a great example of custom tabulation. It also shows how you can reuse some components of the default tabulator to speed up development.
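For a minimal starting point, the HumanEval `Scripts/custom_tabulate.py` included elsewhere in this change simply delegates to the default tabulator. A sketch of that pattern (note that in this port the package is named `agbench` rather than `autogenbench`):
```python
import sys

from agbench.tabulate_cmd import default_tabulate


def main(args):
    # args is a drop-in replacement for sys.argv: args[0] is the invocation
    # command and args[1:] are the command-line parameters.
    default_tabulate(args)


if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
```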
### Cloning
If you wish your benchmark to be available via the `autogenbench clone` command, you will need to take three additional steps:
#### Manifest
First, provide a `MANIFEST.json` file in the root of your benchmark. An example is provided below, from which you can see the schema:
```json
{
"files": {
"Templates/TwoAgents/prompt.txt": "Templates/TwoAgents/prompt.txt",
"Templates/TwoAgents/coding/my_tests.py": "Templates/TwoAgents/coding/my_tests.py",
"Templates/TwoAgents/scenario.py": "Templates/TwoAgents/scenario.py",
"README.md": "README.md",
"Scripts/init_tasks.py": "Scripts/init_tasks.py",
"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py"
}
}
```
The keys of the `files` dictionary are local paths, relative to your benchmark's root directory. The values are relative paths in the AutoGen GitHub repository (relative to the folder where the MANIFEST.json file is located). In most cases, the keys and values will be identical.
#### SCENARIOS dictionary
Second, you must add an entry to the `scenarios` dictionary in `autogen/samples/tools/autogenbench/scenarios/MANIFEST.json`.
#### Scripts/init_tasks.py
Finally, you should provide a `Scripts/init_tasks.py` file in your benchmark folder, and include a `main()` method therein. This method will be loaded and called automatically by `autogenbench clone` after all manifest files have been downloaded.
This `init_tasks.py` script is a great place to download benchmarks from their original sources and convert them to the JSONL format required by AutoGenBench:
- See `HumanEval/Scripts/init_tasks.py` for an example of how to expand a benchmark from an original GitHub repository.
- See `GAIA/Scripts/init_tasks.py` for an example of how to expand a benchmark from `Hugging Face Hub`.
- See `MATH/Scripts/init_tasks.py` for an example of how to expand a benchmark from an author-hosted website.

View File

@ -0,0 +1,4 @@
recursive-exclude scenarios *
recursive-exclude results *
recursive-exclude tests *
recursive-exclude utils *

View File

@ -0,0 +1,174 @@
# AutoGenBench
AutoGenBench is a tool for repeatedly running a set of pre-defined AutoGen tasks in a setting with tightly-controlled initial conditions. With each run, AutoGenBench will start from a blank slate. The agents being evaluated will need to work out what code needs to be written, and what libraries or dependencies to install, to solve tasks. The results of each run are logged, and can be ingested by analysis or metrics scripts (such as `autogenbench tabulate`). By default, all runs are conducted in freshly-initialized docker containers, providing the recommended level of consistency and safety.
AutoGenBench works with all AutoGen 0.1.* and 0.2.* versions.
## Technical Specifications
If you are already an AutoGenBench pro, and want the full technical specifications, please review the [contributor's guide](CONTRIBUTING.md).
## Docker Requirement
AutoGenBench also requires Docker (Desktop or Engine). **It will not run in GitHub codespaces**, unless you opt for native execution (which is strongly discouraged). To install Docker Desktop see [https://www.docker.com/products/docker-desktop/](https://www.docker.com/products/docker-desktop/).
## Installation and Setup
**To get the most out of AutoGenBench, the `autogenbench` package should be installed**. At present, the easiest way to do this is to install it via `pip`:
```
pip install autogenbench
```
If you would prefer working from source code (e.g., for development, or to utilize an alternate branch), simply clone the [AutoGen](https://github.com/microsoft/autogen) repository, then install `autogenbench` via:
```
pip install -e autogen/samples/tools/autogenbench
```
After installation, you must configure your API keys. As with other AutoGen applications, AutoGenBench will look for the OpenAI keys in the OAI_CONFIG_LIST file in the current working directory, or the OAI_CONFIG_LIST environment variable. This behavior can be overridden using a command-line parameter described later.
If you will be running multiple benchmarks, it is often most convenient to leverage the environment variable option. You can load your keys into the environment variable by executing:
```
export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST)
```
If an OAI_CONFIG_LIST is *not* provided (by means of file or environment variable), AutoGenBench will use the OPENAI_API_KEY environment variable instead.
For some benchmark scenarios, additional keys may be required (e.g., keys for the Bing Search API). These can be added to an `ENV.json` file in the current working folder. An example `ENV.json` file is provided below:
```
{
"BING_API_KEY": "xxxyyyzzz"
}
```
## A Typical Session
Once AutoGenBench is installed and the necessary keys are configured, a typical session will look as follows:
```
autogenbench clone HumanEval
cd HumanEval
autogenbench run Tasks/r_human_eval_two_agents.jsonl
autogenbench tabulate results/r_human_eval_two_agents
```
Where:
- `autogenbench clone HumanEval` downloads and expands the HumanEval benchmark scenario.
- `autogenbench run Tasks/r_human_eval_two_agents.jsonl` runs the tasks defined in `Tasks/r_human_eval_two_agents.jsonl`
- `autogenbench tabulate results/r_human_eval_two_agents` tabulates the results of the run
Each of these commands has extensive in-line help via:
- `autogenbench --help`
- `autogenbench clone --help`
- `autogenbench run --help`
- `autogenbench tabulate --help`
**NOTE:** If you are running `autogenbench` from within the repository, you don't need to run `autogenbench clone`. Instead, navigate to the appropriate scenario folder (e.g., `scenarios/HumanEval`) and run the `Scripts/init_tasks.py` file.
More details of each command are provided in the sections that follow.
## Cloning Benchmarks
To clone an existing benchmark, simply run:
```
autogenbench clone [BENCHMARK]
```
For example,
```
autogenbench clone HumanEval
```
To see which existing benchmarks are available to clone, run:
```
autogenbench clone --list
```
> Note: You might need to log in to HuggingFace to access certain datasets like GAIA. To do this, run `huggingface-cli login` in your terminal and follow the prompts.
## Running AutoGenBench
To run a benchmark (which executes the tasks, but does not compute metrics), simply execute:
```
cd [BENCHMARK]
autogenbench run Tasks
```
For example,
```
cd HumanEval
autogenbench run Tasks
```
The default is to run each task once. To run each scenario 10 times, use:
```
autogenbench run --repeat 10 Tasks
```
The `autogenbench` command-line tool allows a number of command-line arguments to control various parameters of execution. Type ``autogenbench -h`` to explore these options:
```
'autogenbench run' will run the specified autogen scenarios for a given number of repetitions and record all logs and trace information. When running in a Docker environment (default), each run will begin from a common, tightly controlled, environment. The resultant logs can then be further processed by other scripts to produce metrics.
positional arguments:
scenario The JSONL scenario file to run. If a directory is specified,
then all JSONL scenarios in the directory are run. (default:
./scenarios)
options:
-h, --help show this help message and exit
-c CONFIG, --config CONFIG
The environment variable name or path to the OAI_CONFIG_LIST (default: OAI_CONFIG_LIST).
-r REPEAT, --repeat REPEAT
The number of repetitions to run for each scenario (default: 1).
-s SUBSAMPLE, --subsample SUBSAMPLE
Run on a subsample of the tasks in the JSONL file(s). If a decimal value is specified, then run on
the given proportion of tasks in each file. For example "0.7" would run on 70% of tasks, and "1.0"
would run on 100% of tasks. If an integer value is specified, then randomly select *that* number of
tasks from each specified JSONL file. For example "7" would run 7 tasks, while "1" would run only 1
task from each specified JSONL file. (default: 1.0; which is 100%)
-m MODEL, --model MODEL
Filters the config_list to include only models matching the provided model name (default: None, which
is all models).
--requirements REQUIREMENTS
The requirements file to pip install before running the scenario.
-d DOCKER_IMAGE, --docker-image DOCKER_IMAGE
The Docker image to use when running scenarios. Can not be used together with --native. (default:
'autogenbench:default', which will be created if not present)
--native Run the scenarios natively rather than in docker. NOTE: This is not advisable, and should be done
with great caution.
```
## Results
By default, AutoGenBench stores results in a folder hierarchy with the following template:
``./results/[scenario]/[task_id]/[instance_id]``
For example, consider the following folders:
``./results/default_two_agents/two_agent_stocks/0``
``./results/default_two_agents/two_agent_stocks/1``
...
``./results/default_two_agents/two_agent_stocks/9``
This folder holds the results for the ``two_agent_stocks`` task of the ``default_two_agents`` tasks file. The ``0`` folder contains the results of the first instance / run. The ``1`` folder contains the results of the second run, and so on. You can think of the _task_id_ as mapping to a prompt, or a unique set of parameters, while the _instance_id_ defines a specific attempt or run.
Within each folder, you will find the following files:
- *timestamp.txt*: records the date and time of the run, along with the version of the pyautogen library installed
- *console_log.txt*: all console output produced by Docker when running AutoGen. Read this like you would a regular console.
- *[agent]_messages.json*: for each Agent, a log of their messages dictionaries
- *./coding*: A directory containing all code written by AutoGen, and all artifacts produced by that code.
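To make this layout concrete, the following hedged sketch (not part of AutoGenBench) walks the hierarchy and counts the runs whose console log contains the success marker that the default tabulator looks for, per the contributor's guide. Folder and file names follow the conventions described above; the results folder may be capitalized (`Results`) depending on how the run was configured.
```python
import os

results_dir = "results"  # may be "Results" depending on the run configuration

# Walk ./results/[scenario]/[task_id]/[instance_id] and count successful runs.
for scenario in sorted(os.listdir(results_dir)):
    scenario_dir = os.path.join(results_dir, scenario)
    if not os.path.isdir(scenario_dir):
        continue
    for task_id in sorted(os.listdir(scenario_dir)):
        task_dir = os.path.join(scenario_dir, task_id)
        if not os.path.isdir(task_dir):
            continue
        passed = total = 0
        for instance_id in sorted(os.listdir(task_dir)):
            log_path = os.path.join(task_dir, instance_id, "console_log.txt")
            if not os.path.isfile(log_path):
                continue
            total += 1
            with open(log_path, "rt", errors="replace") as fh:
                if "ALL TESTS PASSED !#!#" in fh.read():
                    passed += 1
        print(f"{scenario}/{task_id}: {passed}/{total} runs passed")
```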
## Contributing or Defining New Tasks or Benchmarks
If you would like to develop -- or even contribute -- your own tasks or benchmarks, please review the [contributor's guide](CONTRIBUTING.md) for complete technical details.

View File

@ -0,0 +1,96 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "agbench"
dynamic = ["version"]
authors = [
{ name="Adam Fourney", email="adamfo@microsoft.com" },
]
description = "AGNext Benchmarking Tools"
readme = "README.md"
requires-python = ">=3.8, <3.13"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"openai",
"docker",
"huggingface_hub",
"tabulate",
"azure-identity",
]
[tool.hatch.envs.default]
installer = "uv"
dependencies = [
"pyright==1.1.368",
"mypy==1.10.0",
"ruff==0.4.8",
"types-tabulate",
"types-docker"
]
[tool.hatch.envs.default.extra-scripts]
pip = "{env:HATCH_UV} pip {args}"
[tool.hatch.envs.default.scripts]
fmt = "ruff format"
lint = "ruff check"
check = [
"ruff format",
"ruff check --fix",
"pyright",
"mypy",
]
[tool.hatch.version]
path = "src/agbench/version.py"
[project.scripts]
agbench = "agbench.cli:main"
[tool.ruff]
line-length = 120
fix = true
exclude = ["build", "dist", "src/agbench/res/*", "src/agbench/template/*",]
target-version = "py310"
include = ["src/**"]
[tool.ruff.lint]
select = ["E", "F", "W", "B", "Q", "I", "ASYNC"]
ignore = ["F401", "E501"]
[tool.ruff.lint.flake8-tidy-imports]
[tool.ruff.lint.flake8-tidy-imports.banned-api]
"unittest".msg = "Use `pytest` instead."
[tool.mypy]
files = ["src"]
strict = true
python_version = "3.10"
ignore_missing_imports = true
# from https://blog.wolt.com/engineering/2021/09/30/professional-grade-mypy-configuration/
disallow_untyped_defs = true
no_implicit_optional = true
check_untyped_defs = true
warn_return_any = true
show_error_codes = true
warn_unused_ignores = false
disallow_incomplete_defs = true
disallow_untyped_decorators = true
disallow_any_unimported = true
[tool.pyright]
include = ["src"]
typeCheckingMode = "strict"
reportUnnecessaryIsInstance = false
reportMissingTypeStubs = false

View File

@ -0,0 +1,3 @@
from setuptools import setup
setup()

View File

@ -0,0 +1 @@
from .version import __version__

View File

@ -0,0 +1,4 @@
from .cli import main
if __name__ == "__main__":
main()

View File

@ -0,0 +1,108 @@
import sys
from typing import Callable, List, Optional, Sequence
from typing_extensions import TypedDict
from .run_cmd import run_cli
from .tabulate_cmd import tabulate_cli
from .version import __version__
class CommandSpec(TypedDict):
command: str
description: str
function: Optional[Callable[[Sequence[str]], None]]
def main(args: Optional[List[str]] = None) -> None:
if args is None:
args = sys.argv[:] # Shallow copy
invocation_cmd = "autogenbench"
version_string = f"AutoGenBench version {__version__}"
commands: List[CommandSpec] = [
{
"command": "run",
"description": "run a given benchmark configuration",
"function": run_cli,
},
{
"command": "tabulate",
"description": "tabulate the results of a previous run",
"function": tabulate_cli,
},
{
"command": "--version",
"description": f"print the version of {invocation_cmd}",
"function": lambda _args: print(f"{version_string}"),
},
{"command": "--help", "description": "print this message", "function": None},
]
# Some help string formatting
commands_list = ", ".join(["'" + c["command"] + "'" for c in commands])
max_command_len = max([len(c["command"]) for c in commands])
commands_details = ""
for c in commands:
padded_cmd = c["command"]
while len(padded_cmd) < max_command_len:
padded_cmd = " " + padded_cmd
commands_details += f" {padded_cmd}: {c['description']}\n"
usage_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
Where, COMMAND is one of: {commands_list}
and ARGS are specific to the command.
(use '{invocation_cmd} COMMAND --help' for command-specific help)
""".strip()
help_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
{invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typical session might resemble:
{invocation_cmd} clone HumanEval
cd HumanEval
{invocation_cmd} run Tasks/human_eval_two_agents_gpt4.jsonl
which will download the HumanEval benchmark, expand it, and then run the benchmark once with the `human_eval_two_agents_gpt4` configuration.
Available COMMANDs include:
{commands_details}
Additionally, you can use the --help option with any command for further command-specific instructions. E.g.,
{invocation_cmd} run --help
{invocation_cmd} clone --help
""".strip()
if len(args) < 2:
sys.stderr.write(usage_text + "\n")
sys.exit(2)
for command in commands:
if args[1].lower() == command["command"]:
if command["function"] is None:
sys.stderr.write(help_text + "\n")
sys.exit(0)
else:
command["function"]([invocation_cmd + " " + command["command"]] + args[2:])
sys.exit(0)
# Command not found
sys.stderr.write(f"Invalid command '{args[1]}'. Available commands include: {commands_list}\n")
sys.exit(2)
###############################################################################
if __name__ == "__main__":
main()

View File

@ -0,0 +1,16 @@
import importlib.util
import os
import sys
from types import ModuleType
def load_module(module_path: str) -> ModuleType:
module_name = os.path.basename(module_path).replace(".py", "")
spec = importlib.util.spec_from_file_location(module_name, module_path)
if spec is None:
raise ValueError(f"Could not load module from path: {module_path}")
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
assert spec.loader is not None
spec.loader.exec_module(module)
return module

View File

@ -0,0 +1,45 @@
FROM python:3.11
MAINTAINER AutoGen
# Install packages
RUN apt-get update && apt-get install ffmpeg exiftool -y
# Set the image to the Pacific Timezone
RUN ln -snf /usr/share/zoneinfo/US/Pacific /etc/localtime && echo "US/Pacific" > /etc/timezone
# Upgrade pip
RUN pip install --upgrade pip
# Pre-load autogen dependencies, but not autogen itself since we'll often want to install the latest from source
RUN pip install pyautogen[teachable,lmm,graphs,websurfer]
RUN pip uninstall --yes pyautogen
# Pre-load popular packages as per https://learnpython.com/blog/most-popular-python-packages/
RUN pip install numpy pandas matplotlib seaborn scikit-learn requests urllib3 nltk pillow pytest
# Pre-load packages needed for complex_task file utils
RUN pip install python-docx pdfminer.six requests pillow easyocr python-pptx SpeechRecognition pandas openpyxl pydub mammoth puremagic youtube_transcript_api==0.6.0
# Pre-load Selenium and Playwright
RUN pip install selenium playwright
# Chromium playwright
RUN playwright install --with-deps chromium
RUN playwright install --with-deps firefox
# Chrome for Selenium (need to run dpkg twice to resolve dependencies)
# RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# RUN dpkg -i google-chrome-stable_current_amd64.deb || :
# RUN apt -f install -y
# RUN dpkg -i google-chrome-stable_current_amd64.deb
# Fix an incompatibility with numpy
RUN pip uninstall --yes numpy
RUN pip install "numpy<2.0"
# Pre-load the OCR model
RUN /usr/bin/echo -e "import easyocr\nreader = easyocr.Reader(['en'])" | python
# Webarena
RUN pip install beartype aiolimiter
RUN /usr/bin/echo -e "import nltk\nnltk.download('punkt')" | python

View File

@ -0,0 +1,728 @@
import argparse
import errno
import json
import logging
import os
import pathlib
import random
import shutil
import subprocess
import sys
import time
import traceback
from typing import Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
import docker
from azure.core.exceptions import ClientAuthenticationError
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from docker.errors import APIError, DockerException, ImageNotFound
from docker.models.containers import Container
from typing_extensions import TypedDict
from .version import __version__
# Figure out where everything is
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
TASK_TIMEOUT = 60 * 120 # 120 minutes
BASE_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, "template")
RESOURCES_PATH = os.path.join(SCRIPT_DIR, "res")
# What platform are we running?
IS_WIN32 = sys.platform == "win32"
# This is the tag given to the image that is *built* when no other image is provided.
# Do not use this field to specify the name of an existing image (e.g., on Dockerhub)
DEFAULT_DOCKER_IMAGE_TAG = "agbench"
DEFAULT_ENV_FILE = "ENV.json"
# Get a random number generator for subsampling
subsample_rng = random.Random(425)
class ScenarioInstance(TypedDict):
id: str
template: Union[str, List[Union[str, List[str]]]]
substitutions: Dict[str, Dict[str, str]]
values: Dict[str, Dict[str, str]]
def run_scenarios(
scenario: str,
n_repeats: int,
is_native: bool,
token_provider: Optional[Callable[[], str]],
docker_image: Optional[str] = None,
results_dir: str = "Results",
subsample: Union[None, int, float] = None,
) -> None:
"""
Run a set of autogenbench scenarios a given number of times.
Args:
scenario (path): The file or folder containing the scenario JSONL instances. If given a folder, then
all JSONL files in the folder will be loaded and run.
n_repeats (int): The number of times each scenario instance will be repeated
is_native (bool): True if the scenario should be run locally rather than in Docker (proceed with caution!)
results_dir (path): The folder where results will be saved.
"""
files: List[str] = []
# Figure out which files or folders we are working with
if scenario == "-" or os.path.isfile(scenario):
files.append(scenario)
elif os.path.isdir(scenario):
for f in os.listdir(scenario):
scenario_file = os.path.join(scenario, f)
if not os.path.isfile(scenario_file):
continue
if not scenario_file.lower().endswith(".jsonl"):
continue
files.append(scenario_file)
else:
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), scenario)
# Run all the scenario files
for scenario_file in files:
scenario_name: Optional[str] = None
scenario_dir: Optional[str] = None
file_handle = None
# stdin
if scenario_file == "-":
scenario_name = "stdin"
scenario_dir = "."
file_handle = sys.stdin
else:
scenario_name_parts = os.path.basename(scenario_file).split(".")
scenario_name_parts.pop()
scenario_name = ".".join(scenario_name_parts)
scenario_dir = os.path.dirname(os.path.realpath(scenario_file))
file_handle = open(scenario_file, "rt")
# Read all the lines, then subsample if needed
lines = [line for line in file_handle]
if subsample is not None:
# How many lines are we sampling
n = 0
# It's a proportion
if 0 <= subsample < 1:
n = int(len(lines) * subsample + 0.5)
# It's a raw count
else:
n = int(subsample)
n = max(0, min(n, len(lines)))
lines = subsample_rng.sample(lines, n)
for line in lines:
instance = json.loads(line)
# Create a folder to store the results
# Results base
if not os.path.isdir(results_dir):
os.mkdir(results_dir)
# Results for the scenario
results_scenario = os.path.join(results_dir, scenario_name)
if not os.path.isdir(results_scenario):
os.mkdir(results_scenario)
# Results for the instance
results_instance = os.path.join(results_scenario, instance["id"])
if not os.path.isdir(results_instance):
os.mkdir(results_instance)
# Results for the repeats
for i in range(0, n_repeats):
results_repetition = os.path.join(results_instance, str(i))
# Skip it if it already exists
if os.path.isdir(results_repetition):
print(f"Found folder {results_repetition} ... Skipping.")
continue
print(f"Running scenario {results_repetition}")
# Expand the scenario
expand_scenario(scenario_dir, instance, results_repetition)
# Prepare the environment (keys/values that need to be added)
env = get_scenario_env(token_provider)
# Run the scenario
if is_native:
run_scenario_natively(results_repetition, env)
else:
run_scenario_in_docker(
results_repetition,
env,
docker_image=docker_image,
)
# Close regular files
if scenario_file != "-":
file_handle.close()
def expand_scenario(scenario_dir: str, scenario: ScenarioInstance, output_dir: str) -> None:
"""
Expand a scenario into a folder.
Despite some awkwardness created by backwards compatibility and notational conveniences, expansion is conceptually simple.
It is a series of copy commands (similar to `cp -R`), followed by a series of in-place find-and-replace operations.
"""
template = scenario["template"]
# Either key works for finding the substitutions list. "values" may be deprecated in the future
substitutions = scenario["substitutions"] if "substitutions" in scenario else scenario["values"]
# Older versions are only one-level deep. Convert them.
if len(substitutions) > 0 and isinstance(substitutions[next(iter(substitutions))], str):
substitutions = {"scenario.py": cast(Dict[str, str], substitutions)}
copy_operations: List[Tuple[str, str]] = []
# Handle file (str), folder (str), or mapping (List) templates
if isinstance(template, str):
template_path = os.path.join(scenario_dir, template)
if os.path.isdir(template_path):
copy_operations.append((template, ""))
else:
copy_operations.append((template, "scenario.py"))
elif isinstance(template, list):
for elm in template:
if isinstance(elm, list):
copy_operations.append((elm[0], elm[1]))
else:
copy_operations.append((elm, ""))
else:
raise ValueError("expand_scenario expects an str or list for 'template'")
# The global includes folder is always copied
shutil.copytree(
BASE_TEMPLATE_PATH,
output_dir,
ignore=shutil.ignore_patterns("*.example"),
dirs_exist_ok=False,
)
# Expand other folders
for items in copy_operations:
src_path = pathlib.Path(os.path.join(scenario_dir, items[0])).absolute()
dest_path = pathlib.Path(os.path.join(output_dir, items[1])).absolute()
if os.path.isdir(src_path):
shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
else:
if os.path.isdir(dest_path):
# If the destination is a directory, use the same filename
shutil.copyfile(src_path, os.path.join(dest_path, os.path.basename(src_path)))
else:
# Otherwise use the filename provided
shutil.copyfile(src_path, dest_path)
# Expand templated files
for templated_file in substitutions.keys(): # Keys are relative file paths
# Read the templated file into memory
template_contents: List[str] = list()
with open(os.path.join(output_dir, templated_file), "rt") as fh:
for line in fh:
template_contents.append(line)
# Rewrite the templated file with substitutions
values = substitutions[templated_file]
with open(os.path.join(output_dir, templated_file), "wt") as fh:
for line in template_contents:
for k, v in values.items():
line = line.replace(k, v)
fh.write(line)
def get_scenario_env(token_provider: Optional[Callable[[], str]], env_file: str = DEFAULT_ENV_FILE) -> Dict[str, str]:
"""
Return a dictionary of environment variables needed to run a scenario.
Args:
config_list (list): An Autogen OAI_CONFIG_LIST to be used when running scenarios.
env_file (str): The path to the env_file to read. (default: DEFAULT_ENV_FILE)
Returns: A dictionary of keys and values that need to be added to the system environment.
"""
env: Dict[str, str] = dict()
# Populate with commonly needed keys
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
env["OPENAI_API_KEY"] = openai_api_key
bing_api_key = os.environ.get("BING_API_KEY")
if bing_api_key is not None and len(bing_api_key.strip()) > 0:
env["BING_API_KEY"] = bing_api_key
## Support Azure auth tokens
azure_openai_ad_token = os.environ.get("AZURE_OPENAI_AD_TOKEN")
if not azure_openai_ad_token and token_provider:
azure_openai_ad_token = token_provider()
if azure_openai_ad_token is not None and len(azure_openai_ad_token.strip()) > 0:
env["AZURE_OPENAI_AD_TOKEN"] = azure_openai_ad_token
# Update with any values from the ENV.json file
if os.path.isfile(env_file):
with open(env_file, "rt") as fh:
env.update(json.loads(fh.read()))
return env
def run_scenario_natively(work_dir: str, env: Mapping[str, str], timeout: int = TASK_TIMEOUT) -> None:
"""
Run a scenario in the native environment.
Args:
work_dir (path): the path to the working directory previously created to house this scenario instance
"""
# Get the current working directory
cwd = os.getcwd()
# Prepare the environment variables
full_env = os.environ.copy()
full_env.update(env)
# Navigate to the scenario
os.chdir(work_dir)
print("\n\n" + os.getcwd() + "\n===================================================================")
# Prepare the run script
with open(os.path.join("run.sh"), "wt") as f:
f.write(
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Native"
echo "autogenbench version: {__version__}" > timestamp.txt
# Create and activate the virtual environment
# This is called in a subprocess, and will not impact the parent
{sys.executable} -m venv .autogenbench_venv
. .autogenbench_venv/bin/activate
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
. ./global_init.sh
fi
# Run the scenario init script if it exists
if [ -f scenario_init.sh ] ; then
. ./scenario_init.sh
fi
# Run the scenario
pip install -r requirements.txt
echo SCENARIO.PY STARTING !#!#
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
else
echo SCENARIO.PY COMPLETE !#!#
fi
# Clean up
if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
fi
# Run the global finalize script if it exists
if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
# We don't need to deactivate the venv because it's
# contained in the subprocess; but we should clean it up
if [ -d .autogenbench_venv ] ; then
rm -Rf .autogenbench_venv
fi
echo RUN.SH COMPLETE !#!#
"""
)
# Run the script and log the output
with open("console_log.txt", "wb") as f:
process = subprocess.Popen(
["sh", "run.sh"],
env=full_env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
for c in iter(lambda: process.stdout.read(1), b""): # type: ignore
f.write(c)
os.write(sys.stdout.fileno(), c) # Write binary to stdout
# Return where we started
os.chdir(cwd)
return
def run_scenario_in_docker(
work_dir: str, env: Mapping[str, str], timeout: int = TASK_TIMEOUT, docker_image: Optional[str] = None
) -> None:
"""
Run a scenario in a Docker environment.
Args:
work_dir (path): the path to the working directory previously created to house this scenario instance
timeout (Optional, int): the number of seconds to allow a Docker container to run before timing out
"""
client = docker.from_env()
image = None
# If the docker_image is None, then we will fetch DEFAULT_DOCKER_IMAGE_TAG, if present,
# or build it if missing.
if docker_image is None:
# Pull a suitable image
try:
image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
except ImageNotFound:
print(f"Building default Docker image '{DEFAULT_DOCKER_IMAGE_TAG}'. This may take a few minutes...")
try:
build_default_docker_image(client, DEFAULT_DOCKER_IMAGE_TAG)
image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
except DockerException:
print(f"Failed to build image '{DEFAULT_DOCKER_IMAGE_TAG}'")
# Otherwise get the requested image
else:
try:
image = client.images.get(docker_image)
except ImageNotFound:
# pull the image
print(f"Pulling image '{docker_image}'")
try:
image = client.images.pull(docker_image)
except DockerException:
print(f"Failed to pull image '{docker_image}'")
# Prepare the run script
with open(os.path.join(work_dir, "run.sh"), "wt", newline="\n") as f:
f.write(
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Docker"
umask 000
echo "autogenbench version: {__version__}" > timestamp.txt
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
. ./global_init.sh
fi
# Run the scenario init script if it exists
if [ -f scenario_init.sh ] ; then
. ./scenario_init.sh
fi
# Run the scenario
pip install -r requirements.txt
echo SCENARIO.PY STARTING !#!#
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
else
echo SCENARIO.PY COMPLETE !#!#
fi
# Clean up
if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
fi
# Run the global finalize script if it exists
if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
echo RUN.SH COMPLETE !#!#
"""
)
# Figure out what folders to mount
volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
# Add the autogen repo if we can find it
autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
if autogen_repo_base is None:
autogen_repo_base = find_autogen_repo(os.getcwd())
elif not os.path.isdir(autogen_repo_base):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
if autogen_repo_base is not None:
volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/agnext", "mode": "rw"}
print("Mounting:")
for k in volumes:
bind = volumes[k]["bind"]
mode = volumes[k]["mode"].upper()
if bind == "/workspace":
k = os.path.relpath(k)
print(f"[{mode}]\t'{k}' => '{bind}'")
print("===================================================================")
assert image is not None
# Create and run the container
container: Container = cast(
Container,
client.containers.run(
image,
command=["sh", "run.sh"],
working_dir="/workspace",
environment=dict(env),
detach=True,
remove=True,
auto_remove=True,
# Type hint of docker is wrong here
volumes=volumes, # type: ignore
),
)
# Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
docker_timeout: float = timeout + 60  # One full minute after the point at which the in-script timeout command should already have fired
start_time = time.time()
logs = container.logs(stream=True)
log_file = open(os.path.join(work_dir, "console_log.txt"), "wt", encoding="utf-8")
stopping = False
exiting = False
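# 'stopping' means the Docker-level timeout fired; 'exiting' means the user pressed Ctrl-C and we should exit once the logs drain.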
while True:
try:
chunk = cast(bytes, next(logs))  # Manually step the iterator so each read is captured by the try/except
# Stream the data to the log file and the console
chunk_str = chunk.decode("utf-8")
log_file.write(chunk_str)
log_file.flush()
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
sys.stdout.write(chunk_str)
sys.stdout.flush()
# Check if we need to terminate
if not stopping and time.time() - start_time >= docker_timeout:
container.stop()
# Don't exit the loop right away, as there are things we may still want to read from the logs
# but remember how we got here.
stopping = True
except KeyboardInterrupt:
log_file.write("\nKeyboard interrupt (Ctrl-C). Attempting to exit gracefully.\n")
log_file.flush()
sys.stdout.write("\nKeyboard interrupt (Ctrl-C). Attempting to exit gracefully.\n")
sys.stdout.flush()
# Start the exit process, and give it a minute, but keep iterating
container.stop()
exiting = True
docker_timeout = time.time() - start_time + 60
except StopIteration:
break
# Clean up the container
try:
container.remove()
except APIError:
pass
if stopping: # By this line we've exited the loop, and the container has actually stopped.
log_file.write("\nDocker timed out.\n")
log_file.flush()
sys.stdout.write("\nDocker timed out.\n")
sys.stdout.flush()
if exiting: # User hit ctrl-C
sys.exit(1)
def build_default_docker_image(docker_client: docker.DockerClient, image_tag: str) -> None:
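"""Build the default Docker image from the Dockerfile under RESOURCES_PATH, tagging it with image_tag and streaming the build output to stdout."""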
for segment in docker_client.api.build(
path=RESOURCES_PATH,
dockerfile="Dockerfile",
rm=True,
tag=image_tag,
decode=True,
):
if "stream" in segment:
sys.stdout.write(segment["stream"])
def find_autogen_repo(path: str) -> Optional[str]:
"""
Utility for identifying if the path is a subdirectory of the autogen repo.
Returns: the path to the root of the autogen repo if one is found, otherwise None
"""
# Normalize the path (we expect a directory)
path = os.path.abspath(path)
if os.path.isfile(path):
path = os.path.dirname(path)
while True:
test_path = os.path.join(path, "python", "src", "agnext") # We found agnext
if os.path.isdir(test_path):
return os.path.join(path, "python")
# Stop if we hit the root
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
if parent_dir == path:
break
# Keep searching
path = parent_dir
return None
def run_cli(args: Sequence[str]) -> None:
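"""Command-line entry point for running scenarios: parse the arguments, confirm native execution when requested, and dispatch to run_scenarios()."""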
invocation_cmd = args[0]
args = args[1:]
# Prepare the argument parser
parser = argparse.ArgumentParser(
prog=invocation_cmd,
description=f"{invocation_cmd} will run the specified AutoGen scenarios for a given number of repetitions and record all logs and trace information. When running in a Docker environment (default), each run will begin from a common, tightly controlled, environment. The resultant logs can then be further processed by other scripts to produce metrics.".strip(),
)
parser.add_argument(
"scenario",
help="The JSONL scenario file to run. If a directory is specified, then all JSONL scenarios in the directory are run. If set to '-', then read from stdin.",
)
parser.add_argument(
"-r",
"--repeat",
type=int,
help="The number of repetitions to run for each scenario (default: 1).",
default=1,
)
parser.add_argument(
"-s",
"--subsample",
type=str,
help='Run on a subsample of the tasks in the JSONL file(s). If a decimal value is specified, then run on the given proportion of tasks in each file. For example "0.7" would run on 70%% of tasks, and "1.0" would run on 100%% of tasks. If an integer value is specified, then randomly select *that* number of tasks from each specified JSONL file. For example "7" would run 7 tasks, while "1" would run only 1 task from each specified JSONL file. (default: 1.0; which is 100%%)',
default=None,
)
parser.add_argument(
"-d",
"--docker-image",
type=str,
help="The Docker image to use when running scenarios. Can not be used together with --native. (default: '"
+ DEFAULT_DOCKER_IMAGE_TAG
+ "', which will be created if not present)",
default=None,
)
parser.add_argument(
"--native",
action="store_true",
help="Run the scenarios natively rather than in docker. NOTE: This is not advisable, and should be done with great caution.",
)
parsed_args = parser.parse_args(args)
# Don't allow both --docker-image and --native on the same command
if parsed_args.docker_image is not None and parsed_args.native:
sys.exit("The options --native and --docker-image can not be used together. Exiting.")
# Warn if running natively
if parsed_args.native:
if IS_WIN32:
sys.exit("Running scenarios with --native is not supported in Windows. Exiting.")
# This parser does not define a --requirements option; guard the legacy check so --native does not raise an AttributeError.
if getattr(parsed_args, "requirements", None) is not None:
sys.exit("--requirements is not compatible with --native. Exiting.")
sys.stderr.write(
"WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
)
# Does an environment variable override the prompt?
allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
if allow_native is None or allow_native == "":
choice = input(
'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
)
if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
elif allow_native.strip().lower() != "yes":
sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
else:
sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
time.sleep(0.75) # Pause very briefly so the message isn't lost in the noise
# Parse the subsample
subsample = None
if parsed_args.subsample is not None:
subsample = float(parsed_args.subsample)
if "." in parsed_args.subsample: # Intention is to run on a proportion
if subsample == 1.0: # Intention is to run 100%, which is the default
subsample = None  # None means 100%; we use None to differentiate it from the integer 1
elif subsample < 0 or subsample > 1.0:
raise (
ValueError(
"Subsample must either be an integer (specified without a decimal), or a Real number between 0.0 and 1.0"
)
)
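# e.g. --subsample 0.1 runs on roughly 10% of the tasks in each file, while --subsample 3 randomly selects 3 tasks per file.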
# Get the Azure bearer token generator if a token wasn't provided and there's any evidence of using Azure
azure_token_provider = None
if not os.environ.get("AZURE_OPENAI_AD_TOKEN") and os.path.isdir(pathlib.Path("~/.azure").expanduser()):
logging.disable(logging.CRITICAL)
try:
azure_token_provider = get_bearer_token_provider(
DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)
azure_token_provider() # Call it once to warm it up, and make sure it doesn't throw an error
print("Found Azure token provider.")
except ClientAuthenticationError:
error_message = traceback.format_exc()
azure_token_provider = None
print(
f"Azure token provider failed loading. Try using 'az login --use-device-code':\n{error_message}\n\nContinuing without Azure token provider..."
)
logging.disable(logging.NOTSET)
# Run the scenario
run_scenarios(
scenario=parsed_args.scenario,
n_repeats=parsed_args.repeat,
is_native=True if parsed_args.native else False,
token_provider=azure_token_provider,
docker_image=parsed_args.docker_image,
subsample=subsample,
)

View File

@ -0,0 +1,236 @@
import argparse
import os
import sys
from copy import deepcopy
from typing import Any, Callable, List, Optional, Sequence, Tuple
import tabulate as tb
from .load_module import load_module
# Figure out where everything is
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
TABULATE_FILE = "custom_tabulate.py"
SUCCESS_STRINGS = [
"ALL TESTS PASSED !#!#",
]
EXCLUDE_DIR_NAMES = ["__pycache__"]
def find_tabulate_module(search_dir: str, stop_dir: Optional[str] = None) -> Optional[str]:
"""Hunt for the tabulate script."""
search_dir = os.path.abspath(search_dir)
if not os.path.isdir(search_dir):
raise ValueError(f"'{search_dir}' is not a directory.")
stop_dir = None if stop_dir is None else os.path.abspath(stop_dir)
while True:
path = os.path.join(search_dir, TABULATE_FILE)
if os.path.isfile(path):
return path
path = os.path.join(search_dir, "Scripts", TABULATE_FILE)
if os.path.isfile(path):
return path
path = os.path.join(search_dir, "scripts", TABULATE_FILE)
if os.path.isfile(path):
return path
# Stop if we hit the stop_dir
if search_dir == stop_dir:
break
# Stop if we hit the root
parent_dir = os.path.abspath(os.path.join(search_dir, os.pardir))
if parent_dir == search_dir:
break
search_dir = parent_dir
return None
def default_scorer(instance_dir: str, success_strings: List[str] = SUCCESS_STRINGS) -> Optional[bool]:
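"""Score a single instance directory by scanning its console_log.txt: True if any success string appears, False if the log exists without one, None if no log was written."""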
console_log = os.path.join(instance_dir, "console_log.txt")
if os.path.isfile(console_log):
with open(console_log, "rt") as fh:
content = fh.read()
for s in success_strings:
if s in content:
return True
return False
else:
return None
ScorerFunc = Callable[[str], Optional[bool]]
def default_tabulate(
args: List[str], scorer: ScorerFunc = default_scorer, exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES
) -> Tuple[argparse.Namespace, List[List[Any]]]:
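"""Score every task/instance directory under runlogs with `scorer` and print the results as a table or CSV, returning the parsed arguments and the raw results."""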
invocation_cmd = args[0]
args = args[1:]
warning = f"CAUTION: '{invocation_cmd}' is in early preview and is not thoroughly tested.\nPlease do not cite values from these calculations in academic work without first inspecting and verifying the results in the run logs yourself."
# Prepare the argument parser
parser = argparse.ArgumentParser(
prog=invocation_cmd,
description=f"{invocation_cmd} will tabulate the results of a previous run.",
)
parser.add_argument(
"runlogs",
help="The path where the run's logs are stored.",
)
parser.add_argument(
"-c",
"--csv",
action="store_true",
help="Output the results in CSV format.",
)
parser.add_argument(
"-e", "--excel", help="Output the results in Excel format. Please specify a path for the Excel file.", type=str
)
parsed_args = parser.parse_args(args)
runlogs: str = parsed_args.runlogs
all_results: List[List[Any]] = list()
max_instances = 0
for task_id in sorted(
os.listdir(runlogs),
key=lambda s: os.path.getmtime(os.path.join(runlogs, s)),
):
if task_id in exclude_dir_names:
continue
task_path = os.path.join(runlogs, task_id)
if not os.path.isdir(task_path):
continue
# Collect the results vector
results: List[Any] = [task_id]
instance = 0
instance_dir = os.path.join(task_path, str(instance))
while os.path.isdir(instance_dir):
results.append(scorer(instance_dir))
instance += 1
instance_dir = os.path.join(task_path, str(instance))
max_instances = max(max_instances, instance)
# Buffer the results
all_results.append(results)
if parsed_args.csv:
# Create a header
header = ["Task Id"]
for i in range(0, max_instances):
header.append("Trial " + str(i) + " Success")
print(",".join(header))
for row in all_results:
str_row = [f"{v}" if v is not None else "" for v in row]
while len(str_row) < max_instances + 1:
str_row.append("")
print(",".join(str_row))
# Print out alpha-version warning
sys.stderr.write("\n" + warning + "\n\n")
else:
# Create a header
header = ["\nTask Id"]
for i in range(0, max_instances):
header.append("Trial " + str(i) + "\nSuccess")
# Create the footer
def _count_equals(value: Optional[bool], trial: int) -> int:
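# Count how many rows have the given value for this trial; value=None counts missing/unscored results.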
count = 0
for row in all_results:
# Guard against rows that recorded fewer trials than max_instances
row_value = row[trial + 1] if trial + 1 < len(row) else None
is_answer_matched = row_value[0] if isinstance(row_value, tuple) else row_value
# Count missing
if value is None:
if trial + 1 < len(row):
if is_answer_matched is None:
count += 1
else:
count += 1
# Count match
elif trial + 1 < len(row) and is_answer_matched == value:
count += 1
return count
footer: List[Any] = []
footer_row: List[Any] = ["Successes"]
for i in range(0, max_instances):
footer_row.append(_count_equals(True, i))
footer.append(footer_row)
footer_row = ["Failures"]
for i in range(0, max_instances):
footer_row.append(_count_equals(False, i))
footer.append(footer_row)
footer_row = ["Missing"]
for i in range(0, max_instances):
footer_row.append(_count_equals(None, i))
footer.append(footer_row)
footer_row = ["Total"]
for i in range(0, max_instances):
footer_row.append(footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1])
footer.append(footer_row)
table = deepcopy(all_results)
for row in table:
for trial in range(0, max_instances):
if isinstance(row[trial + 1], tuple):
row[trial + 1] = row[trial + 1][0]
table.append(tb.SEPARATING_LINE) # type: ignore
table.extend(footer)
print(tb.tabulate(table, headers=header))
# Print out alpha-version warning
sys.stderr.write("\n" + warning + "\n\n")
return parsed_args, all_results
def tabulate_cli(args: Sequence[str]) -> None:
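"""Command-line entry point for tabulating results: look for a scenario-specific custom_tabulate.py (in the current directory or by searching upward from any bare argument), falling back to default_tabulate() when none is found."""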
invocation_cmd = args[0]
args = args[1:]
# We won't assume much about the arguments, letting the dynamically-loaded
# tabulate modules parse them however they want. But we will use the bare
# arguments (those not starting with a "-") to help us find which module to load.
module_path = find_tabulate_module(os.getcwd(), stop_dir=os.getcwd())
for arg in reversed(args):
if module_path is not None:
break
if arg.startswith("-"):
continue
module_path = find_tabulate_module(arg)
# Load the module and hand over control
if module_path is None:
sys.stderr.write("Using default tabulation method.\n\n")
default_tabulate([invocation_cmd] + list(args))
else:
sys.stderr.write(f"Using tabulation method defined in '{module_path}'\n\n")
load_module(module_path).main([invocation_cmd] + list(args))

View File

@ -0,0 +1 @@
# Global finalize.

View File

@ -0,0 +1 @@
echo AUTOGEN_TESTBED_SETTING: [$AUTOGEN_TESTBED_SETTING]

View File

@ -0,0 +1 @@
__version__ = "0.0.1a1"