Initial work porting AutoGenBench to AGNext (#141)

* Initial check-in of agbench

* Moved agbench to intended directory.

* Removed pyautogen dependency

* moved to using process_until_idle

* Added TeamOne template.

* Use TeamOne agent classes.

* migrate to hatch, move benchmarks out, add CI (#166)

* Resolve type issues (#168)

Thanks for fixing types.

* Fixed import.

---------

Co-authored-by: Jack Gerrits <jackgerrits@users.noreply.github.com>
afourney 2024-07-02 10:58:49 -07:00 committed by GitHub
parent 766635394a
commit ca42b560e9
30 changed files with 2114 additions and 4 deletions

View File

@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch
@ -25,7 +25,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch
@ -37,7 +37,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch
@ -49,7 +49,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
working-directory: ["./python", "./python/teams/team-one"]
working-directory: ["./python", "./python/teams/team-one", "./python/tools/agbench"]
steps:
- uses: actions/checkout@v4
- name: Install Hatch

View File

@ -0,0 +1,21 @@
# HumanEval Benchmark
This scenario implements a modified version of the [HumanEval](https://arxiv.org/abs/2107.03374) benchmark.
Compared to the original benchmark, there are **two key differences** here:
- A chat model rather than a completion model is used.
- The agents get pass/fail feedback about their implementations, and can keep trying until they succeed or run out of tokens or turns.
## Running the tasks
```
autogenbench run Tasks/human_eval_two_agents.jsonl
autogenbench tabulate Results/human_eval_two_agents
```
For faster development and iteration, a reduced HumanEval set is available via `Tasks/r_human_eval_two_agents.jsonl`; it contains only 26 problems of varying difficulty.
## References
**Evaluating Large Language Models Trained on Code**<br/>
Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, Wojciech Zaremba<br/>
[https://arxiv.org/abs/2107.03374](https://arxiv.org/abs/2107.03374)

View File

@ -0,0 +1,12 @@
import os
import sys
from agbench.tabulate_cmd import default_tabulate
def main(args):
default_tabulate(args)
if __name__ == "__main__" and __package__ is None:
main(sys.argv)

View File

@ -0,0 +1,124 @@
#
# Run this file to download the human_eval dataset, and create a corresponding testbed scenario:
# (default: ./Tasks/human_eval_[template].jsonl, one file per template)
#
import base64
import gzip
import io
import json
import os
import re
import requests
URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
# A selected subset of HumanEval problems to work with during development
# Deprecated 2/5/2024 -- Use subsample instead
REDUCED_SET = [
"HumanEval/2",
"HumanEval/26",
"HumanEval/32",
"HumanEval/33",
"HumanEval/36",
"HumanEval/38",
"HumanEval/41",
"HumanEval/50",
"HumanEval/56",
"HumanEval/65",
"HumanEval/67",
"HumanEval/84",
"HumanEval/85",
"HumanEval/86",
"HumanEval/89",
"HumanEval/99",
"HumanEval/104",
"HumanEval/113",
"HumanEval/115",
"HumanEval/120",
"HumanEval/124",
"HumanEval/126",
"HumanEval/132",
"HumanEval/135",
"HumanEval/140",
"HumanEval/146",
]
def download_human_eval():
"""Download the HumanEval dataset, un-gzips it, and returns a list of its parsed JSON objects."""
# Send an HTTP request to the URL of the file
response = requests.get(URL)
# Ensure we raise an error if the download failed
response.raise_for_status()
# Create a BytesIO object from the response content
buffer = io.BytesIO(response.content)
# Read the file, line by line, populating a list of parsed JSON objects
results = []
with gzip.GzipFile(fileobj=buffer) as f_in:
for line in f_in:
# Parse each line as JSON
results.append(json.loads(line))
return results
def create_jsonl(name, tasks, template):
"""Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path."""
# Create a task directory if it doesn't exist
if not os.path.isdir(TASKS_DIR):
os.mkdir(TASKS_DIR)
# Create the jsonl file
with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh:
for task in tasks:
print(f"Converting: [{name}] {task['task_id']}")
record = {
"id": task["task_id"].replace("/", "_"),
"template": template,
"substitutions": {
"scenario.py": {"__ENTRY_POINT__": task["entry_point"]},
"prompt.txt": {"__PROMPT__": task["prompt"]},
"unit_tests.py": {"__TEST__": task["test"]},
},
}
fh.write(json.dumps(record).strip() + "\n")
###############################################################################
def main():
human_eval = download_human_eval()
# Deprecated: reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]
# list all directories in the Templates directory
# and populate a dictionary with the name and path
templates = {}
for entry in os.scandir(TEMPLATES_DIR):
if entry.is_dir():
templates[re.sub(r"\s", "", entry.name)] = entry.path
# Create one scenario JSONL file per template
for t in templates.items():
create_jsonl(f"human_eval_{t[0]}", human_eval, t[1])
# Deprecated: create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1])
if __name__ == "__main__" and __package__ is None:
main()

View File

@ -0,0 +1 @@
__PROMPT__

View File

@ -0,0 +1 @@
/agnext/teams/team-one

View File

@ -0,0 +1,79 @@
import asyncio
#from typing import Any, Dict, List, Tuple, Union
from agnext.application import SingleThreadedAgentRuntime
from agnext.components.models import (
AzureOpenAIChatCompletionClient,
LLMMessage,
ModelCapabilities,
UserMessage,
)
from agnext.components.code_executor import LocalCommandLineCodeExecutor
from team_one.agents.coder import Coder, Executor
from team_one.agents.orchestrator import RoundRobinOrchestrator
from team_one.messages import BroadcastMessage
async def main() -> None:
# Create the runtime.
runtime = SingleThreadedAgentRuntime()
# Create the AzureOpenAI client, with AAD auth
#token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
client = AzureOpenAIChatCompletionClient(
api_version="2024-02-15-preview",
azure_endpoint="https://aif-complex-tasks-west-us-3.openai.azure.com/",
model="gpt-4o-2024-05-13",
model_capabilities=ModelCapabilities(function_calling=True, json_output=True, vision=True),
#azure_ad_token_provider=token_provider
)
# Register agents.
coder = runtime.register_and_get_proxy(
"Coder",
lambda: Coder(model_client=client),
)
executor = runtime.register_and_get_proxy(
"Executor",
lambda: Executor("A agent for executing code", executor=LocalCommandLineCodeExecutor())
)
runtime.register("orchestrator", lambda: RoundRobinOrchestrator([coder, executor]))
prompt = ""
with open("prompt.txt", "rt") as fh:
prompt = fh.read()
entry_point = "__ENTRY_POINT__"
task = f"""
The following python code imports the `run_tests` function from unit_tests.py, and runs
it on the function `{entry_point}`. This will run a set of automated unit tests to verify the
correct implementation of `{entry_point}`. However, `{entry_point}` is only partially
implemented in the code below. Complete the implementation of `{entry_point}` and then execute
a new stand-alone code block that contains everything needed to run the tests, including: importing
`unit_tests`, calling `run_tests({entry_point})`, as well as {entry_point}'s complete definition,
such that this code block can be run directly in Python.
```python
from unit_tests import run_tests
{prompt}
# Run the unit tests
run_tests({entry_point})
```
""".strip()
await runtime.publish_message(BroadcastMessage(content=UserMessage(content=task, source="human")), namespace="default")
# Run the runtime until the task is completed.
await runtime.process_until_idle()
if __name__ == "__main__":
import logging
logging.basicConfig(level=logging.WARNING)
logging.getLogger("agnext").setLevel(logging.DEBUG)
asyncio.run(main())

View File

@ -0,0 +1,15 @@
# Disable ruff linter for template files
# ruff: noqa: F821 E722
import sys
__TEST__
def run_tests(candidate):
try:
check(candidate)
# We can search for this string in the output
print("ALL TESTS PASSED !#!#")
except AssertionError:
sys.exit("SOME TESTS FAILED - TRY AGAIN !#!#")

View File

@ -0,0 +1 @@
__PROMPT__

View File

@ -0,0 +1 @@
/agnext

View File

@ -0,0 +1,213 @@
import asyncio
import json
import re
import uuid
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple, Union
from agnext.application import SingleThreadedAgentRuntime
from agnext.components import FunctionCall, TypeRoutedAgent, message_handler
from agnext.components.code_executor import CodeBlock, CodeExecutor, LocalCommandLineCodeExecutor
from agnext.components.models import (
AssistantMessage,
AzureOpenAIChatCompletionClient,
ChatCompletionClient,
FunctionExecutionResult,
FunctionExecutionResultMessage,
LLMMessage,
ModelCapabilities,
OpenAIChatCompletionClient,
SystemMessage,
UserMessage,
)
from agnext.components.tools import CodeExecutionResult, PythonCodeExecutionTool
from agnext.core import AgentId, CancellationToken
#from azure.identity import DefaultAzureCredential, get_bearer_token_provider
@dataclass
class TaskMessage:
content: str
@dataclass
class CodeExecutionRequestMessage:
session_id: str
execution_request: str
@dataclass
class CodeExecutionResultMessage:
session_id: str
output: str
exit_code: int
class Coder(TypeRoutedAgent):
"""An agent that uses tools to write, execute, and debug Python code."""
DEFAULT_DESCRIPTION = "A Python coder assistant."
DEFAULT_SYSTEM_MESSAGES = [
SystemMessage("""You are a helpful AI assistant. Solve tasks using your Python coding skills. The code you output must be formatted in Markdown code blocks demarcated by triple backticks (```). As an example:
```python
def main():
print("Hello world.")
if __name__ == "__main__":
main()
```
The user cannot provide any feedback or perform any other action beyond executing the code you suggest. In particular, the user can't modify your code, and can't copy and paste anything, and can't fill in missing values. Thus, do not suggest incomplete code which requires users to perform any of these actions.
Check the execution result returned by the user. If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes -- code blocks must stand alone and be ready to execute without modification. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, and think of a different approach to try.
If the code has executed successfully and the problem is solved, reply "TERMINATE".
""")
]
def __init__(
self,
model_client: ChatCompletionClient,
description: str = DEFAULT_DESCRIPTION,
system_messages: List[SystemMessage] = DEFAULT_SYSTEM_MESSAGES,
max_turns: int | None = None,
) -> None:
super().__init__(description)
self._model_client = model_client
self._system_messages = system_messages
self._session_memory: Dict[str, List[LLMMessage]] = {}
self._max_turns = max_turns
@message_handler
async def handle_user_message(
self, message: TaskMessage, cancellation_token: CancellationToken
) -> None:
"""Handle a user message, execute the model and tools, and returns the response."""
# Create a new session.
session_id = str(uuid.uuid4())
self._session_memory.setdefault(session_id, []).append(UserMessage(content=message.content, source="user"))
# Make an inference to the model.
response = await self._model_client.create(self._system_messages + self._session_memory[session_id])
assert isinstance(response.content, str)
self._session_memory[session_id].append(AssistantMessage(content=response.content, source=self.metadata["name"]))
await self.publish_message(CodeExecutionRequestMessage(execution_request=response.content, session_id=session_id), cancellation_token=cancellation_token)
@message_handler
async def handle_code_execution_result(self, message: CodeExecutionResultMessage, cancellation_token: CancellationToken) -> None:
execution_result = f"The script ran, then exited with Unix exit code: {message.exit_code}\nIts output was:\n{message.output}"
# Store the code execution output.
self._session_memory[message.session_id].append(UserMessage(content=execution_result, source="user"))
# Count the number of rounds so far
if self._max_turns is not None:
n_turns = sum(1 for message in self._session_memory[message.session_id] if isinstance(message, AssistantMessage))
if n_turns >= self._max_turns:
return
# Make an inference to the model.
response = await self._model_client.create(self._system_messages + self._session_memory[message.session_id])
assert isinstance(response.content, str)
self._session_memory[message.session_id].append(AssistantMessage(content=response.content, source=self.metadata["name"]))
if "TERMINATE" in response.content:
return
else:
await self.publish_message(CodeExecutionRequestMessage(execution_request=response.content, session_id=message.session_id), cancellation_token=cancellation_token)
class Executor(TypeRoutedAgent):
def __init__(self, description: str, executor: CodeExecutor) -> None:
super().__init__(description)
self._executor = executor
@message_handler
async def handle_code_execution(self, message: CodeExecutionRequestMessage, cancellation_token: CancellationToken) -> None:
# Extract code block from the message.
code = self._extract_execution_request(message.execution_request)
if code is not None:
execution_requests = [CodeBlock(code=code, language="python")]
future = asyncio.get_event_loop().run_in_executor(None, self._executor.execute_code_blocks, execution_requests)
cancellation_token.link_future(future)
result = await future
await self.publish_message(CodeExecutionResultMessage(output=result.output, exit_code=result.exit_code, session_id=message.session_id))
else:
await self.publish_message(CodeExecutionResultMessage(output="No code block detected. Please provide a markdown-encoded code block to execute.", exit_code=1, session_id=message.session_id))
def _extract_execution_request(self, markdown_text: str) -> Union[str, None]:
pattern = r"```(\w+)\n(.*?)\n```"
# Search for the pattern in the markdown text
match = re.search(pattern, markdown_text, re.DOTALL)
# Extract the language and code block if a match is found
if match:
return match.group(2)
return None
async def main() -> None:
# Create the runtime.
runtime = SingleThreadedAgentRuntime()
# Create the AzureOpenAI client, with AAD auth
#token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
client = AzureOpenAIChatCompletionClient(
api_version="2024-02-15-preview",
azure_endpoint="https://aif-complex-tasks-west-us-3.openai.azure.com/",
model="gpt-4o-2024-05-13",
model_capabilities=ModelCapabilities(function_calling=True, json_output=True, vision=True),
#azure_ad_token_provider=token_provider
)
# Register agents.
coder = runtime.register_and_get(
"Coder",
lambda: Coder(model_client=client),
)
runtime.register(
"Executor",
lambda: Executor("A agent for executing code", executor=LocalCommandLineCodeExecutor())
)
prompt = ""
with open("prompt.txt", "rt") as fh:
prompt = fh.read()
entry_point = "__ENTRY_POINT__"
task = TaskMessage(f"""
The following python code imports the `run_tests` function from unit_tests.py, and runs
it on the function `{entry_point}`. This will run a set of automated unit tests to verify the
correct implementation of `{entry_point}`. However, `{entry_point}` is only partially
implemented in the code below. Complete the implementation of `{entry_point}` and then execute
a new stand-alone code block that contains everything needed to run the tests, including: importing
`unit_tests`, calling `run_tests({entry_point})`, as well as {entry_point}'s complete definition,
such that this code block can be run directly in Python.
```python
from unit_tests import run_tests
{prompt}
# Run the unit tests
run_tests({entry_point})
```
""".strip())
# Send a task to the tool user.
await runtime.send_message(task, coder)
# Run the runtime until the task is completed.
await runtime.process_until_idle()
if __name__ == "__main__":
import logging
logging.basicConfig(level=logging.WARNING)
logging.getLogger("agnext").setLevel(logging.DEBUG)
asyncio.run(main())

View File

@ -0,0 +1,15 @@
# Disable ruff linter for template files
# ruff: noqa: F821 E722
import sys
__TEST__
def run_tests(candidate):
try:
check(candidate)
# We can search for this string in the output
print("ALL TESTS PASSED !#!#")
except AssertionError:
sys.exit("SOME TESTS FAILED - TRY AGAIN !#!#")

View File

@ -84,6 +84,24 @@ build = "sphinx-build docs/src docs/build"
serve = "sphinx-autobuild --watch src docs/src docs/build"
check = "sphinx-build --fail-on-warning docs/src docs/build"
# Benchmark environments
[tool.hatch.envs.bench-humaneval-teamone]
installer = "uv"
detached = true
dependencies = [
"agnext@{root:uri}",
"agbench@{root:uri}/tools/agbench",
"team-one@{root:uri}/teams/team-one",
]
[tool.hatch.envs.bench-humaneval-twoagents]
installer = "uv"
detached = true
dependencies = [
"agnext@{root:uri}",
"agbench@{root:uri}/tools/agbench",
]
[tool.ruff]
line-length = 120
fix = true

python/tools/agbench/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
scenarios/*/Downloads
scenarios/*/Tasks
*/Results

View File

@ -0,0 +1,188 @@
# Contributing to AutoGenBench
As part of the broader AutoGen project, AutoGenBench welcomes community contributions. Contributions are subject to AutoGen's [contribution guidelines](https://microsoft.github.io/autogen/docs/Contribute), as well as a few additional AutoGenBench-specific requirements outlined here. You may also wish to develop your own private benchmark scenarios and the guidance in this document will help with such efforts as well. Below you will find the general requirements, followed by a detailed technical description.
## General Contribution Requirements
We ask that all contributions to AutoGenBench adhere to the following:
- Follow AutoGen's broader [contribution guidelines](https://microsoft.github.io/autogen/docs/Contribute)
- All AutoGenBench benchmarks should live in a subfolder of `/samples/tools/autogenbench/scenarios` alongside `HumanEval`, `GAIA`, etc.
- Benchmark scenarios should include a detailed README.md, in the root of their folder, describing the benchmark and providing citations where warranted.
- Benchmark data (tasks, ground truth, etc.) should be downloaded from their original sources rather than hosted in the AutoGen repository (unless the benchmark is original, and the repository *is* the original source)
- You can use the `Scripts/init_tasks.py` file to automate this download.
- Basic scoring should be compatible with the `autogenbench tabulate` command (e.g., by outputting logs compatible with the default tabulation mechanism, or by providing a `Scripts/custom_tabulate.py` file)
- If you wish your benchmark to be compatible with the `autogenbench clone` command, include a `MANIFEST.json` file in the root of your folder.
These requirements are further detailed below, but if you simply copy the `HumanEval` folder, you will already be off to a great start.
## Implementing and Running Benchmark Tasks
At the core of any benchmark is a set of tasks. To implement tasks that are runnable by AutoGenBench, you must adhere to AutoGenBench's templating and scenario expansion algorithms, as outlined below.
### Task Definitions
All tasks are stored in JSONL files (in subdirectories under `./Tasks`). Each line of a tasks file is a JSON object with the following schema:
```
{
"id": string,
"template": dirname,
"substitutions" {
"filename1": {
"find_string1_1": replace_string1_1,
"find_string1_2": replace_string1_2,
...
"find_string1_M": replace_string1_N
}
"filename2": {
"find_string2_1": replace_string2_1,
"find_string2_2": replace_string2_2,
...
"find_string2_N": replace_string2_N
}
}
}
```
For example:
```
{
"id": "two_agent_stocks_gpt4",
"template": "default_two_agents",
"substitutions": {
"scenario.py": {
"__MODEL__": "gpt-4",
},
"prompt.txt": {
"__PROMPT__": "Plot and save to disk a chart of NVDA and TESLA stock price YTD."
}
}
}
```
In this example, the string `__MODEL__` will be replaced in the file `scenario.py`, while the string `__PROMPT__` will be replaced in the `prompt.txt` file.
The `template` field can also take on a list value, but this usage is considered advanced and is not described here. See the `autogenbench/run_cmd.py` code, or the `GAIA` benchmark tasks files for additional information about this option.
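For reference, each line of a tasks file is an independent JSON object, which is how `run_cmd.py` later in this change consumes it. A minimal, hedged sketch of reading such a file (the path below is illustrative, and the field names follow the schema described above):
```python
import json

# Minimal sketch: read a tasks JSONL file one record at a time.
with open("Tasks/human_eval_two_agents.jsonl", "rt") as fh:
    for line in fh:
        task = json.loads(line)
        # Each record names a template and the per-file find/replace substitutions.
        print(task["id"], task["template"], list(task["substitutions"].keys()))
```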
## Task Instance Expansion Algorithm
Once the tasks have been defined, as per above, they must be "instantiated" before they can be run. This instantiation happens automatically when the user issues the `autogenbench run` command and involves creating a local folder to share with Docker. Each instance and repetition gets its own folder along the path: `./results/[scenario]/[task_id]/[instance_id]`. For the sake of brevity we will refer to this folder as the `DEST_FOLDER`.
The algorithm for populating the `DEST_FOLDER` is as follows:
1. Pre-populate DEST_FOLDER with all the basic starter files for running a scenario (found in `autogenbench/template`).
2. Recursively copy the template folder specified in the JSONL line to DEST_FOLDER (if the JSON `template` attribute points to a folder). If the JSON's `template` attribute instead points to a file, copy the file, but rename it to `scenario.py`.
3. Apply any string replacements, as outlined in the prior section.
4. Write a run.sh file to DEST_FOLDER that will be executed by Docker when it is loaded. The `run.sh` is described below.
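The authoritative implementation of this expansion is `expand_scenario` in `run_cmd.py` (included later in this change). The following is only a simplified sketch of steps 2 and 3: the function name `expand_task` is hypothetical, and it ignores list-valued templates, the starter-file pre-population of step 1, and the `run.sh` generation of step 4.
```python
import os
import shutil
from typing import Any, Dict


def expand_task(scenario_dir: str, task: Dict[str, Any], dest_folder: str) -> None:
    """Simplified sketch: copy the template (step 2), then apply substitutions (step 3)."""
    template_path = os.path.join(scenario_dir, task["template"])
    if os.path.isdir(template_path):
        # Folder-valued templates are copied recursively into DEST_FOLDER.
        shutil.copytree(template_path, dest_folder, dirs_exist_ok=True)
    else:
        # File-valued templates are copied and renamed to scenario.py.
        shutil.copyfile(template_path, os.path.join(dest_folder, "scenario.py"))

    # In-place find-and-replace, file by file, as described in the prior section.
    for rel_path, replacements in task["substitutions"].items():
        target = os.path.join(dest_folder, rel_path)
        with open(target, "rt") as fh:
            contents = fh.read()
        for find_str, replace_str in replacements.items():
            contents = contents.replace(find_str, replace_str)
        with open(target, "wt") as fh:
            fh.write(contents)
```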
## Scenario Execution Algorithm
Once the task has been instantiated it is run (via run.sh). This script will execute the following steps:
1. If a file named `global_init.sh` is present, run it.
2. If a file named `scenario_init.sh` is present, run it.
3. Install the requirements.txt file (if running in Docker)
4. Run the task via `python scenario.py`
5. If the scenario.py exited cleanly (exit code 0), then print "SCENARIO.PY COMPLETE !#!#"
6. Clean up (delete cache, etc.)
7. If a file named `scenario_finalize.sh` is present, run it.
8. If a file named `global_finalize.sh` is present, run it.
9. echo "RUN COMPLETE !#!#", signaling that all steps completed.
Notably, this means that scenarios can add custom init and teardown logic by including `scenario_init.sh` and `scenario_finalize.sh` files.
At the time of this writing, the run.sh file is as follows:
```sh
export AUTOGEN_TESTBED_SETTING="Docker"
umask 000
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
. ./global_init.sh
fi
# Run the scenario init script if it exists
if [ -f scenario_init.sh ] ; then
. ./scenario_init.sh
fi
# Run the scenario
pip install -r requirements.txt
python scenario.py
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
else
echo SCENARIO.PY COMPLETE !#!#
fi
# Clean up
if [ -d .cache ] ; then
rm -Rf .cache
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
fi
# Run the global finalize script if it exists
if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
echo RUN.SH COMPLETE !#!#
```
Be warned that this listing is provided here for illustration purposes, and may vary over time. The sources of truth are the `run.sh` files found in the ``./results/[taskset]/[task_id]/[instance_id]`` folders.
## Integrating with the `tabulate` and `clone` commands.
The above details are sufficient for defining and running tasks, but if you wish to support the `autogenbench tabulate` and `autogenbench clone` commands, a few additional steps are required.
### Tabulations
If you wish to leverage the default tabulation logic, it is as simple as arranging your `scenario.py` file to output the string "ALL TESTS PASSED !#!#" to the console in the event that a task was solved correctly.
If you wish to implement your own tabulation logic, simply create the file `Scripts/custom_tabulate.py` and include a `main(args)` method. Here, the `args` parameter will be provided by AutoGenBench, and is a drop-in replacement for `sys.argv`. In particular, `args[0]` will be the invocation command (similar to the executable or script name in `sys.argv`), and the remaining values (`args[1:]`) are the command line parameters.
Should you provide a custom tabulation script, please implement `--help` and `-h` options for documenting your interface.
The `scenarios/GAIA/Scripts/custom_tabulate.py` is a great example of custom tabulation. It also shows how you can reuse some components of the default tabulator to speed up development.
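For a minimal starting point, the HumanEval `Scripts/custom_tabulate.py` included elsewhere in this change simply delegates to the default tabulator. A sketch of that pattern (note that in this port the package is named `agbench` rather than `autogenbench`):
```python
import sys

from agbench.tabulate_cmd import default_tabulate


def main(args):
    # args is a drop-in replacement for sys.argv: args[0] is the invocation
    # command and args[1:] are the command-line parameters.
    default_tabulate(args)


if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
```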
### Cloning
If you wish your benchmark to be available via the `autogenbench clone` command, you will need to take three additional steps:
#### Manifest
First, provide a `MANIFEST.json` file in the root of your benchmark. An example is provided below, from which you can see the schema:
```json
{
"files": {
"Templates/TwoAgents/prompt.txt": "Templates/TwoAgents/prompt.txt",
"Templates/TwoAgents/coding/my_tests.py": "Templates/TwoAgents/coding/my_tests.py",
"Templates/TwoAgents/scenario.py": "Templates/TwoAgents/scenario.py",
"README.md": "README.md",
"Scripts/init_tasks.py": "Scripts/init_tasks.py",
"Scripts/custom_tabulate.py": "Scripts/custom_tabulate.py"
}
}
```
The keys of the `files` dictionary are local paths, relative to your benchmark's root directory. The values are relative paths in the AutoGen GitHub repository (relative to the folder where the MANIFEST.json file is located). In most cases, the keys and values will be identical.
#### SCENARIOS dictionary
Second, you must add an entry to the `scenarios` dictionary in `autogen/samples/tools/autogenbench/scenarios/MANIFEST.json`.
#### Scripts/init_tasks.py
Finally, you should provide a `Scripts/init_tasks.py` file in your benchmark folder, and include a `main()` method therein. This method will be loaded and called automatically by `autogenbench clone` after all manifest files have been downloaded.
This `init_tasks.py` script is a great place to download benchmarks from their original sources and convert them to the JSONL format required by AutoGenBench:
- See `HumanEval/Scripts/init_tasks.py` for an example of how to expand a benchmark from an original GitHub repository.
- See `GAIA/Scripts/init_tasks.py` for an example of how to expand a benchmark from `Hugging Face Hub`.
- See `MATH/Scripts/init_tasks.py` for an example of how to expand a benchmark from an author-hosted website.

View File

@ -0,0 +1,4 @@
recursive-exclude scenarios *
recursive-exclude results *
recursive-exclude tests *
recursive-exclude utils *

View File

@ -0,0 +1,174 @@
# AutoGenBench
AutoGenBench is a tool for repeatedly running a set of pre-defined AutoGen tasks in a setting with tightly-controlled initial conditions. With each run, AutoGenBench will start from a blank slate. The agents being evaluated will need to work out what code needs to be written, and what libraries or dependencies to install, to solve tasks. The results of each run are logged, and can be ingested by analysis or metrics scripts (such as `autogenbench tabulate`). By default, all runs are conducted in freshly-initialized docker containers, providing the recommended level of consistency and safety.
AutoGenBench works with all AutoGen 0.1.* and 0.2.* versions.
## Technical Specifications
If you are already an AutoGenBench pro, and want the full technical specifications, please review the [contributor's guide](CONTRIBUTING.md).
## Docker Requirement
AutoGenBench also requires Docker (Desktop or Engine). **It will not run in GitHub codespaces**, unless you opt for native execution (which is strongly discouraged). To install Docker Desktop see [https://www.docker.com/products/docker-desktop/](https://www.docker.com/products/docker-desktop/).
## Installation and Setup
**To get the most out of AutoGenBench, the `autogenbench` package should be installed**. At present, the easiest way to do this is to install it via `pip`:
```
pip install autogenbench
```
If you would prefer working from source code (e.g., for development, or to utilize an alternate branch), simply clone the [AutoGen](https://github.com/microsoft/autogen) repository, then install `autogenbench` via:
```
pip install -e autogen/samples/tools/autogenbench
```
After installation, you must configure your API keys. As with other AutoGen applications, AutoGenBench will look for the OpenAI keys in the OAI_CONFIG_LIST file in the current working directory, or the OAI_CONFIG_LIST environment variable. This behavior can be overridden using a command-line parameter described later.
If you will be running multiple benchmarks, it is often most convenient to leverage the environment variable option. You can load your keys into the environment variable by executing:
```
export OAI_CONFIG_LIST=$(cat ./OAI_CONFIG_LIST)
```
If an OAI_CONFIG_LIST is *not* provided (by means of file or environment variable), AutoGenBench will use the OPENAI_API_KEY environment variable instead.
For some benchmark scenarios, additional keys may be required (e.g., keys for the Bing Search API). These can be added to an `ENV.json` file in the current working folder. An example `ENV.json` file is provided below:
```
{
"BING_API_KEY": "xxxyyyzzz"
}
```
## A Typical Session
Once AutoGenBench is installed and the necessary keys are configured, a typical session will look as follows:
```
autogenbench clone HumanEval
cd HumanEval
autogenbench run Tasks/r_human_eval_two_agents.jsonl
autogenbench tabulate results/r_human_eval_two_agents
```
Where:
- `autogenbench clone HumanEval` downloads and expands the HumanEval benchmark scenario.
- `autogenbench run Tasks/r_human_eval_two_agents.jsonl` runs the tasks defined in `Tasks/r_human_eval_two_agents.jsonl`
- `autogenbench tabulate results/r_human_eval_two_agents` tabulates the results of the run
Each of these commands has extensive in-line help via:
- `autogenbench --help`
- `autogenbench clone --help`
- `autogenbench run --help`
- `autogenbench tabulate --help`
**NOTE:** If you are running `autogenbench` from within the repository, you don't need to run `autogenbench clone`. Instead, navigate to the appropriate scenario folder (e.g., `scenarios/HumanEval`) and run the `Scripts/init_tasks.py` file.
More details of each command are provided in the sections that follow.
## Cloning Benchmarks
To clone an existing benchmark, simply run:
```
autogenbench clone [BENCHMARK]
```
For example,
```
autogenbench clone HumanEval
```
To see which existing benchmarks are available to clone, run:
```
autogenbench clone --list
```
> Note: You might need to log in to HuggingFace to access certain datasets like GAIA. To do this, run `huggingface-cli login` in your terminal and follow the prompts.
## Running AutoGenBench
To run a benchmark (which executes the tasks, but does not compute metrics), simply execute:
```
cd [BENCHMARK]
autogenbench run Tasks
```
For example,
```
cd HumanEval
autogenbench run Tasks
```
The default is to run each task once. To run each scenario 10 times, use:
```
autogenbench run --repeat 10 Tasks
```
The `autogenbench` command-line tool allows a number of command-line arguments to control various parameters of execution. Type ``autogenbench -h`` to explore these options:
```
'autogenbench run' will run the specified autogen scenarios for a given number of repetitions and record all logs and trace information. When running in a Docker environment (default), each run will begin from a common, tightly controlled, environment. The resultant logs can then be further processed by other scripts to produce metrics.
positional arguments:
scenario The JSONL scenario file to run. If a directory is specified,
then all JSONL scenarios in the directory are run. (default:
./scenarios)
options:
-h, --help show this help message and exit
-c CONFIG, --config CONFIG
The environment variable name or path to the OAI_CONFIG_LIST (default: OAI_CONFIG_LIST).
-r REPEAT, --repeat REPEAT
The number of repetitions to run for each scenario (default: 1).
-s SUBSAMPLE, --subsample SUBSAMPLE
Run on a subsample of the tasks in the JSONL file(s). If a decimal value is specified, then run on
the given proportion of tasks in each file. For example "0.7" would run on 70% of tasks, and "1.0"
would run on 100% of tasks. If an integer value is specified, then randomly select *that* number of
tasks from each specified JSONL file. For example "7" would run 7 tasks, while "1" would run only 1
task from each specified JSONL file. (default: 1.0; which is 100%)
-m MODEL, --model MODEL
Filters the config_list to include only models matching the provided model name (default: None, which
is all models).
--requirements REQUIREMENTS
The requirements file to pip install before running the scenario.
-d DOCKER_IMAGE, --docker-image DOCKER_IMAGE
The Docker image to use when running scenarios. Can not be used together with --native. (default:
'autogenbench:default', which will be created if not present)
--native Run the scenarios natively rather than in docker. NOTE: This is not advisable, and should be done
with great caution.
```
## Results
By default, AutoGenBench stores results in a folder hierarchy with the following template:
``./results/[scenario]/[task_id]/[instance_id]``
For example, consider the following folders:
``./results/default_two_agents/two_agent_stocks/0``
``./results/default_two_agents/two_agent_stocks/1``
...
``./results/default_two_agents/two_agent_stocks/9``
This folder holds the results for the ``two_agent_stocks`` task of the ``default_two_agents`` tasks file. The ``0`` folder contains the results of the first instance / run. The ``1`` folder contains the results of the second run, and so on. You can think of the _task_id_ as mapping to a prompt, or a unique set of parameters, while the _instance_id_ defines a specific attempt or run.
Within each folder, you will find the following files:
- *timestamp.txt*: records the date and time of the run, along with the version of the pyautogen library installed
- *console_log.txt*: all console output produced by Docker when running AutoGen. Read this like you would a regular console.
- *[agent]_messages.json*: for each Agent, a log of their messages dictionaries
- *./coding*: A directory containing all code written by AutoGen, and all artifacts produced by that code.
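To make this layout concrete, the following hedged sketch (not part of AutoGenBench) walks the hierarchy and counts the runs whose console log contains the success marker that the default tabulator looks for, per the contributor's guide. Folder and file names follow the conventions described above; the results folder may be capitalized (`Results`) depending on how the run was configured.
```python
import os

results_dir = "results"  # may be "Results" depending on the run configuration

# Walk ./results/[scenario]/[task_id]/[instance_id] and count successful runs.
for scenario in sorted(os.listdir(results_dir)):
    scenario_dir = os.path.join(results_dir, scenario)
    if not os.path.isdir(scenario_dir):
        continue
    for task_id in sorted(os.listdir(scenario_dir)):
        task_dir = os.path.join(scenario_dir, task_id)
        if not os.path.isdir(task_dir):
            continue
        passed = total = 0
        for instance_id in sorted(os.listdir(task_dir)):
            log_path = os.path.join(task_dir, instance_id, "console_log.txt")
            if not os.path.isfile(log_path):
                continue
            total += 1
            with open(log_path, "rt", errors="replace") as fh:
                if "ALL TESTS PASSED !#!#" in fh.read():
                    passed += 1
        print(f"{scenario}/{task_id}: {passed}/{total} runs passed")
```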
## Contributing or Defining New Tasks or Benchmarks
If you would like to develop -- or even contribute -- your own tasks or benchmarks, please review the [contributor's guide](CONTRIBUTING.md) for complete technical details.

View File

@ -0,0 +1,96 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "agbench"
dynamic = ["version"]
authors = [
{ name="Adam Fourney", email="adamfo@microsoft.com" },
]
description = "AGNext Benchmarking Tools"
readme = "README.md"
requires-python = ">=3.8, <3.13"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
dependencies = [
"openai",
"docker",
"huggingface_hub",
"tabulate",
"azure-identity",
]
[tool.hatch.envs.default]
installer = "uv"
dependencies = [
"pyright==1.1.368",
"mypy==1.10.0",
"ruff==0.4.8",
"types-tabulate",
"types-docker"
]
[tool.hatch.envs.default.extra-scripts]
pip = "{env:HATCH_UV} pip {args}"
[tool.hatch.envs.default.scripts]
fmt = "ruff format"
lint = "ruff check"
check = [
"ruff format",
"ruff check --fix",
"pyright",
"mypy",
]
[tool.hatch.version]
path = "src/agbench/version.py"
[project.scripts]
agbench = "agbench.cli:main"
[tool.ruff]
line-length = 120
fix = true
exclude = ["build", "dist", "src/agbench/res/*", "src/agbench/template/*",]
target-version = "py310"
include = ["src/**"]
[tool.ruff.lint]
select = ["E", "F", "W", "B", "Q", "I", "ASYNC"]
ignore = ["F401", "E501"]
[tool.ruff.lint.flake8-tidy-imports]
[tool.ruff.lint.flake8-tidy-imports.banned-api]
"unittest".msg = "Use `pytest` instead."
[tool.mypy]
files = ["src"]
strict = true
python_version = "3.10"
ignore_missing_imports = true
# from https://blog.wolt.com/engineering/2021/09/30/professional-grade-mypy-configuration/
disallow_untyped_defs = true
no_implicit_optional = true
check_untyped_defs = true
warn_return_any = true
show_error_codes = true
warn_unused_ignores = false
disallow_incomplete_defs = true
disallow_untyped_decorators = true
disallow_any_unimported = true
[tool.pyright]
include = ["src"]
typeCheckingMode = "strict"
reportUnnecessaryIsInstance = false
reportMissingTypeStubs = false

View File

@ -0,0 +1,3 @@
from setuptools import setup
setup()

View File

@ -0,0 +1 @@
from .version import __version__

View File

@ -0,0 +1,4 @@
from .cli import main
if __name__ == "__main__":
main()

View File

@ -0,0 +1,108 @@
import sys
from typing import Callable, List, Optional, Sequence
from typing_extensions import TypedDict
from .run_cmd import run_cli
from .tabulate_cmd import tabulate_cli
from .version import __version__
class CommandSpec(TypedDict):
command: str
description: str
function: Optional[Callable[[Sequence[str]], None]]
def main(args: Optional[List[str]] = None) -> None:
if args is None:
args = sys.argv[:] # Shallow copy
invocation_cmd = "autogenbench"
version_string = f"AutoGenBench version {__version__}"
commands: List[CommandSpec] = [
{
"command": "run",
"description": "run a given benchmark configuration",
"function": run_cli,
},
{
"command": "tabulate",
"description": "tabulate the results of a previous run",
"function": tabulate_cli,
},
{
"command": "--version",
"description": f"print the version of {invocation_cmd}",
"function": lambda _args: print(f"{version_string}"),
},
{"command": "--help", "description": "print this message", "function": None},
]
# Some help string formatting
commands_list = ", ".join(["'" + c["command"] + "'" for c in commands])
max_command_len = max([len(c["command"]) for c in commands])
commands_details = ""
for c in commands:
padded_cmd = c["command"]
while len(padded_cmd) < max_command_len:
padded_cmd = " " + padded_cmd
commands_details += f" {padded_cmd}: {c['description']}\n"
usage_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
Where, COMMAND is one of: {commands_list}
and ARGS are specific to the command.
(use '{invocation_cmd} COMMAND --help' for command-specific help)
""".strip()
help_text = f"""
{version_string}
usage: {invocation_cmd} COMMAND ARGS
{invocation_cmd} is a tool for running and managing AutoGen benchmark scenarios. A typical session might resemble:
{invocation_cmd} clone HumanEval
cd HumanEval
{invocation_cmd} run Tasks/human_eval_two_agents_gpt4.jsonl
which will download the HumanEval benchmark, expand it, and then run the benchmark once with the `human_eval_two_agents_gpt4` configuration.
Available COMMANDs include:
{commands_details}
Additionally, you can use the --help option with any command for further command-specific instructions. E.g.,
{invocation_cmd} run --help
{invocation_cmd} clone --help
""".strip()
if len(args) < 2:
sys.stderr.write(usage_text + "\n")
sys.exit(2)
for command in commands:
if args[1].lower() == command["command"]:
if command["function"] is None:
sys.stderr.write(help_text + "\n")
sys.exit(0)
else:
command["function"]([invocation_cmd + " " + command["command"]] + args[2:])
sys.exit(0)
# Command not found
sys.stderr.write(f"Invalid command '{args[1]}'. Available commands include: {commands_list}\n")
sys.exit(2)
###############################################################################
if __name__ == "__main__":
main()

View File

@ -0,0 +1,16 @@
import importlib.util
import os
import sys
from types import ModuleType
def load_module(module_path: str) -> ModuleType:
module_name = os.path.basename(module_path).replace(".py", "")
spec = importlib.util.spec_from_file_location(module_name, module_path)
if spec is None:
raise ValueError(f"Could not load module from path: {module_path}")
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
assert spec.loader is not None
spec.loader.exec_module(module)
return module

View File

@ -0,0 +1,45 @@
FROM python:3.11
MAINTAINER AutoGen
# Install packages
RUN apt-get update && apt-get install ffmpeg exiftool -y
# Set the image to the Pacific Timezone
RUN ln -snf /usr/share/zoneinfo/US/Pacific /etc/localtime && echo "US/Pacific" > /etc/timezone
# Upgrade pip
RUN pip install --upgrade pip
# Pre-load autogen dependencies, but not autogen itself since we'll often want to install the latest from source
RUN pip install pyautogen[teachable,lmm,graphs,websurfer]
RUN pip uninstall --yes pyautogen
# Pre-load popular packages as per https://learnpython.com/blog/most-popular-python-packages/
RUN pip install numpy pandas matplotlib seaborn scikit-learn requests urllib3 nltk pillow pytest
# Pre-load packages needed for complex_task file utils
RUN pip install python-docx pdfminer.six requests pillow easyocr python-pptx SpeechRecognition pandas openpyxl pydub mammoth puremagic youtube_transcript_api==0.6.0
# Pre-load Selenium and Playwright
RUN pip install selenium playwright
# Chromium playwright
RUN playwright install --with-deps chromium
RUN playwright install --with-deps firefox
# Chrome for Selenium (need to run dpkg twice to resolve dependencies)
# RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# RUN dpkg -i google-chrome-stable_current_amd64.deb || :
# RUN apt -f install -y
# RUN dpkg -i google-chrome-stable_current_amd64.deb
# Fix an incompatibility with numpy
RUN pip uninstall --yes numpy
RUN pip install "numpy<2.0"
# Pre-load the OCR model
RUN /usr/bin/echo -e "import easyocr\nreader = easyocr.Reader(['en'])" | python
# Webarena
RUN pip install beartype aiolimiter
RUN /usr/bin/echo -e "import nltk\nnltk.download('punkt')" | python

View File

@ -0,0 +1,728 @@
import argparse
import errno
import json
import logging
import os
import pathlib
import random
import shutil
import subprocess
import sys
import time
import traceback
from typing import Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
import docker
from azure.core.exceptions import ClientAuthenticationError
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from docker.errors import APIError, DockerException, ImageNotFound
from docker.models.containers import Container
from typing_extensions import TypedDict
from .version import __version__
# Figure out where everything is
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
TASK_TIMEOUT = 60 * 120 # 120 minutes
BASE_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, "template")
RESOURCES_PATH = os.path.join(SCRIPT_DIR, "res")
# What platform are we running?
IS_WIN32 = sys.platform == "win32"
# This is the tag given to the image that is *built* when no other image is provided.
# Do not use this field to specify the name of an existing image (e.g., on Dockerhub)
DEFAULT_DOCKER_IMAGE_TAG = "agbench"
DEFAULT_ENV_FILE = "ENV.json"
# Get a random number generator for subsampling
subsample_rng = random.Random(425)
class ScenarioInstance(TypedDict):
id: str
template: Union[str, List[Union[str, List[str]]]]
substitutions: Dict[str, Dict[str, str]]
values: Dict[str, Dict[str, str]]
def run_scenarios(
scenario: str,
n_repeats: int,
is_native: bool,
token_provider: Optional[Callable[[], str]],
docker_image: Optional[str] = None,
results_dir: str = "Results",
subsample: Union[None, int, float] = None,
) -> None:
"""
Run a set of autogenbench scenarios a given number of times.
Args:
scenario (path): The file or folder containing the scenario JSONL instances. If given a folder, then
all JSONL files in the folder will be loaded and run.
n_repeats (int): The number of times each scenario instance will be repeated
is_native (bool): True if the scenario should be run locally rather than in Docker (proceed with caution!)
results_dir (path): The folder where results will be saved.
"""
files: List[str] = []
# Figure out which files or folders we are working with
if scenario == "-" or os.path.isfile(scenario):
files.append(scenario)
elif os.path.isdir(scenario):
for f in os.listdir(scenario):
scenario_file = os.path.join(scenario, f)
if not os.path.isfile(scenario_file):
continue
if not scenario_file.lower().endswith(".jsonl"):
continue
files.append(scenario_file)
else:
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), scenario)
# Run all the scenario files
for scenario_file in files:
scenario_name: Optional[str] = None
scenario_dir: Optional[str] = None
file_handle = None
# stdin
if scenario_file == "-":
scenario_name = "stdin"
scenario_dir = "."
file_handle = sys.stdin
else:
scenario_name_parts = os.path.basename(scenario_file).split(".")
scenario_name_parts.pop()
scenario_name = ".".join(scenario_name_parts)
scenario_dir = os.path.dirname(os.path.realpath(scenario_file))
file_handle = open(scenario_file, "rt")
# Read all the lines, then subsample if needed
lines = [line for line in file_handle]
if subsample is not None:
# How many lines are we sampling
n = 0
# It's a proportion
if 0 <= subsample < 1:
n = int(len(lines) * subsample + 0.5)
# It's a raw count
else:
n = int(subsample)
n = max(0, min(n, len(lines)))
lines = subsample_rng.sample(lines, n)
for line in lines:
instance = json.loads(line)
# Create a folder to store the results
# Results base
if not os.path.isdir(results_dir):
os.mkdir(results_dir)
# Results for the scenario
results_scenario = os.path.join(results_dir, scenario_name)
if not os.path.isdir(results_scenario):
os.mkdir(results_scenario)
# Results for the instance
results_instance = os.path.join(results_scenario, instance["id"])
if not os.path.isdir(results_instance):
os.mkdir(results_instance)
# Results for the repeats
for i in range(0, n_repeats):
results_repetition = os.path.join(results_instance, str(i))
# Skip it if it already exists
if os.path.isdir(results_repetition):
print(f"Found folder {results_repetition} ... Skipping.")
continue
print(f"Running scenario {results_repetition}")
# Expand the scenario
expand_scenario(scenario_dir, instance, results_repetition)
# Prepare the environment (keys/values that need to be added)
env = get_scenario_env(token_provider)
# Run the scenario
if is_native:
run_scenario_natively(results_repetition, env)
else:
run_scenario_in_docker(
results_repetition,
env,
docker_image=docker_image,
)
# Close regular files
if scenario_file != "-":
file_handle.close()
def expand_scenario(scenario_dir: str, scenario: ScenarioInstance, output_dir: str) -> None:
"""
Expand a scenario into a folder.
Despite some awkwardness created by backwards compatibility and notational conveniences, expansion is conceptually simple.
It is a series of copy commands (similar to `cp -R`), followed by a series of in-place find-and-replace operations.
"""
template = scenario["template"]
# Either key works for finding the substitutions list. "values" may be deprecated in the future
substitutions = scenario["substitutions"] if "substitutions" in scenario else scenario["values"]
# Older versions are only one-level deep. Convert them.
if len(substitutions) > 0 and isinstance(substitutions[next(iter(substitutions))], str):
substitutions = {"scenario.py": cast(Dict[str, str], substitutions)}
copy_operations: List[Tuple[str, str]] = []
# Handle file (str), folder (str), or mapping (List) templates
if isinstance(template, str):
template_path = os.path.join(scenario_dir, template)
if os.path.isdir(template_path):
copy_operations.append((template, ""))
else:
copy_operations.append((template, "scenario.py"))
elif isinstance(template, list):
for elm in template:
if isinstance(elm, list):
copy_operations.append((elm[0], elm[1]))
else:
copy_operations.append((elm, ""))
else:
raise ValueError("expand_scenario expects an str or list for 'template'")
# The global includes folder is always copied
shutil.copytree(
BASE_TEMPLATE_PATH,
output_dir,
ignore=shutil.ignore_patterns("*.example"),
dirs_exist_ok=False,
)
# Expand other folders
for items in copy_operations:
src_path = pathlib.Path(os.path.join(scenario_dir, items[0])).absolute()
dest_path = pathlib.Path(os.path.join(output_dir, items[1])).absolute()
if os.path.isdir(src_path):
shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
else:
if os.path.isdir(dest_path):
# If the destination is a directory, use the same filename
shutil.copyfile(src_path, os.path.join(dest_path, os.path.basename(src_path)))
else:
# Otherwise use the filename provided
shutil.copyfile(src_path, dest_path)
# Expand templated files
for templated_file in substitutions.keys(): # Keys are relative file paths
# Read the templated file into memory
template_contents: List[str] = list()
with open(os.path.join(output_dir, templated_file), "rt") as fh:
for line in fh:
template_contents.append(line)
# Rewrite the templated file with substitutions
values = substitutions[templated_file]
with open(os.path.join(output_dir, templated_file), "wt") as fh:
for line in template_contents:
for k, v in values.items():
line = line.replace(k, v)
fh.write(line)
def get_scenario_env(token_provider: Optional[Callable[[], str]], env_file: str = DEFAULT_ENV_FILE) -> Dict[str, str]:
"""
Return a dictionary of environment variables needed to run a scenario.
Args:
config_list (list): An Autogen OAI_CONFIG_LIST to be used when running scenarios.
env_file (str): The path to the env_file to read. (default: DEFAULT_ENV_FILE)
Returns: A dictionary of keys and values that need to be added to the system environment.
"""
env: Dict[str, str] = dict()
# Populate with commonly needed keys
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key is not None and len(openai_api_key.strip()) > 0:
env["OPENAI_API_KEY"] = openai_api_key
bing_api_key = os.environ.get("BING_API_KEY")
if bing_api_key is not None and len(bing_api_key.strip()) > 0:
env["BING_API_KEY"] = bing_api_key
## Support Azure auth tokens
azure_openai_ad_token = os.environ.get("AZURE_OPENAI_AD_TOKEN")
if not azure_openai_ad_token and token_provider:
azure_openai_ad_token = token_provider()
if azure_openai_ad_token is not None and len(azure_openai_ad_token.strip()) > 0:
env["AZURE_OPENAI_AD_TOKEN"] = azure_openai_ad_token
# Update with any values from the ENV.json file
if os.path.isfile(env_file):
with open(env_file, "rt") as fh:
env.update(json.loads(fh.read()))
return env
def run_scenario_natively(work_dir: str, env: Mapping[str, str], timeout: int = TASK_TIMEOUT) -> None:
"""
Run a scenario in the native environment.
Args:
work_dir (path): the path to the working directory previously created to house this scenario instance
"""
# Get the current working directory
cwd = os.getcwd()
# Prepare the environment variables
full_env = os.environ.copy()
full_env.update(env)
# Navigate to the scenario
os.chdir(work_dir)
print("\n\n" + os.getcwd() + "\n===================================================================")
# Prepare the run script
with open(os.path.join("run.sh"), "wt") as f:
f.write(
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Native"
echo "autogenbench version: {__version__}" > timestamp.txt
# Create and activate the virtual environment
# This is called in a subprocess, and will not impact the parent
{sys.executable} -m venv .autogenbench_venv
. .autogenbench_venv/bin/activate
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
. ./global_init.sh
fi
# Run the scenario init script if it exists
if [ -f scenario_init.sh ] ; then
. ./scenario_init.sh
fi
# Run the scenario
pip install -r requirements.txt
echo SCENARIO.PY STARTING !#!#
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
else
echo SCENARIO.PY COMPLETE !#!#
fi
# Clean up
if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
fi
# Run the global finalize script if it exists
if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
# We don't need to deactivate the venv because it's
# contained in the subprocess; but we should clean it up
if [ -d .autogenbench_venv ] ; then
rm -Rf .autogenbench_venv
fi
echo RUN.SH COMPLETE !#!#
"""
)
# Run the script and log the output
with open("console_log.txt", "wb") as f:
process = subprocess.Popen(
["sh", "run.sh"],
env=full_env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
)
for c in iter(lambda: process.stdout.read(1), b""): # type: ignore
f.write(c)
os.write(sys.stdout.fileno(), c) # Write binary to stdout
# Return where we started
os.chdir(cwd)
return
def run_scenario_in_docker(
work_dir: str, env: Mapping[str, str], timeout: int = TASK_TIMEOUT, docker_image: Optional[str] = None
) -> None:
"""
Run a scenario in a Docker environment.
Args:
work_dir (path): the path to the working directory previously created to house this scenario instance
timeout (Optional, int): the number of seconds to allow a Docker container to run before timing out
"""
client = docker.from_env()
image = None
# If the docker_image is None, then we will fetch DEFAULT_DOCKER_IMAGE_TAG, if present,
# or build it if missing.
if docker_image is None:
# Pull a suitable image
try:
image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
except ImageNotFound:
print(f"Building default Docker image '{DEFAULT_DOCKER_IMAGE_TAG}'. This may take a few minutes...")
try:
build_default_docker_image(client, DEFAULT_DOCKER_IMAGE_TAG)
image = client.images.get(DEFAULT_DOCKER_IMAGE_TAG)
except DockerException:
print(f"Failed to build image '{DEFAULT_DOCKER_IMAGE_TAG}'")
# Otherwise get the requested image
else:
try:
image = client.images.get(docker_image)
except ImageNotFound:
# pull the image
print(f"Pulling image '{docker_image}'")
try:
image = client.images.pull(docker_image)
except DockerException:
print(f"Failed to pull image '{docker_image}'")
# Prepare the run script
with open(os.path.join(work_dir, "run.sh"), "wt", newline="\n") as f:
f.write(
f"""#
echo RUN.SH STARTING !#!#
export AUTOGEN_TESTBED_SETTING="Docker"
umask 000
echo "autogenbench version: {__version__}" > timestamp.txt
# Run the global init script if it exists
if [ -f global_init.sh ] ; then
. ./global_init.sh
fi
# Run the scenario init script if it exists
if [ -f scenario_init.sh ] ; then
. ./scenario_init.sh
fi
# Run the scenario
pip install -r requirements.txt
echo SCENARIO.PY STARTING !#!#
timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
else
echo SCENARIO.PY COMPLETE !#!#
fi
# Clean up
if [ -d .cache ] ; then
rm -Rf .cache
fi
if [ -d __pycache__ ] ; then
rm -Rf __pycache__
fi
# Run the scenario finalize script if it exists
if [ -f scenario_finalize.sh ] ; then
. ./scenario_finalize.sh
fi
# Run the global finalize script if it exists
if [ -f global_finalize.sh ] ; then
. ./global_finalize.sh
fi
echo RUN.SH COMPLETE !#!#
"""
)
# Figure out what folders to mount
volumes = {str(pathlib.Path(work_dir).absolute()): {"bind": "/workspace", "mode": "rw"}}
# Add the autogen repo if we can find it
autogen_repo_base = os.environ.get("AUTOGENBENCH_REPO_BASE")
if autogen_repo_base is None:
autogen_repo_base = find_autogen_repo(os.getcwd())
elif not os.path.isdir(autogen_repo_base):
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), autogen_repo_base)
if autogen_repo_base is not None:
volumes[str(pathlib.Path(autogen_repo_base).absolute())] = {"bind": "/agnext", "mode": "rw"}
print("Mounting:")
for k in volumes:
bind = volumes[k]["bind"]
mode = volumes[k]["mode"].upper()
if bind == "/workspace":
k = os.path.relpath(k)
print(f"[{mode}]\t'{k}' => '{bind}'")
print("===================================================================")
assert image is not None
# Create and run the container
container: Container = cast(
Container,
client.containers.run(
image,
command=["sh", "run.sh"],
working_dir="/workspace",
environment=dict(env),
detach=True,
remove=True,
auto_remove=True,
# Type hint of docker is wrong here
volumes=volumes, # type: ignore
),
)
# Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
docker_timeout: float = timeout + 60  # One full minute after the point at which the in-script timeout command should already have fired
start_time = time.time()
logs = container.logs(stream=True)
log_file = open(os.path.join(work_dir, "console_log.txt"), "wt", encoding="utf-8")
stopping = False
exiting = False
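# 'stopping' means the Docker-level timeout fired; 'exiting' means the user pressed Ctrl-C and we should exit once the logs drain.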
while True:
try:
chunk = cast(bytes, next(logs))  # Manually step the iterator so each read is captured by the try/except
# Stream the data to the log file and the console
chunk_str = chunk.decode("utf-8")
log_file.write(chunk_str)
log_file.flush()
sys.stdout.reconfigure(encoding="utf-8") # type: ignore
sys.stdout.write(chunk_str)
sys.stdout.flush()
# Check if we need to terminate
if not stopping and time.time() - start_time >= docker_timeout:
container.stop()
# Don't exit the loop right away, as there are things we may still want to read from the logs
# but remember how we got here.
stopping = True
except KeyboardInterrupt:
log_file.write("\nKeyboard interrupt (Ctrl-C). Attempting to exit gracefully.\n")
log_file.flush()
sys.stdout.write("\nKeyboard interrupt (Ctrl-C). Attempting to exit gracefully.\n")
sys.stdout.flush()
# Start the exit process, and give it a minute, but keep iterating
container.stop()
exiting = True
docker_timeout = time.time() - start_time + 60
except StopIteration:
break
# Clean up the container
try:
container.remove()
except APIError:
pass
if stopping: # By this line we've exited the loop, and the container has actually stopped.
log_file.write("\nDocker timed out.\n")
log_file.flush()
sys.stdout.write("\nDocker timed out.\n")
sys.stdout.flush()
if exiting: # User hit ctrl-C
sys.exit(1)
def build_default_docker_image(docker_client: docker.DockerClient, image_tag: str) -> None:
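"""Build the default Docker image from the Dockerfile under RESOURCES_PATH, tagging it with image_tag and streaming the build output to stdout."""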
for segment in docker_client.api.build(
path=RESOURCES_PATH,
dockerfile="Dockerfile",
rm=True,
tag=image_tag,
decode=True,
):
if "stream" in segment:
sys.stdout.write(segment["stream"])
def find_autogen_repo(path: str) -> Optional[str]:
"""
Utility for identifying if the path is a subdirectory of the autogen repo.
Returns: the path to the root of the autogen repo if one is found, otherwise None
"""
# Normalize the path (we expect a directory)
path = os.path.abspath(path)
if os.path.isfile(path):
path = os.path.dirname(path)
while True:
test_path = os.path.join(path, "python", "src", "agnext") # We found agnext
if os.path.isdir(test_path):
return os.path.join(path, "python")
# Stop if we hit the root
parent_dir = os.path.abspath(os.path.join(path, os.pardir))
if parent_dir == path:
break
# Keep searching
path = parent_dir
return None
def run_cli(args: Sequence[str]) -> None:
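"""Command-line entry point for running scenarios: parse the arguments, confirm native execution when requested, and dispatch to run_scenarios()."""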
invocation_cmd = args[0]
args = args[1:]
# Prepare the argument parser
parser = argparse.ArgumentParser(
prog=invocation_cmd,
description=f"{invocation_cmd} will run the specified AutoGen scenarios for a given number of repetitions and record all logs and trace information. When running in a Docker environment (default), each run will begin from a common, tightly controlled, environment. The resultant logs can then be further processed by other scripts to produce metrics.".strip(),
)
parser.add_argument(
"scenario",
help="The JSONL scenario file to run. If a directory is specified, then all JSONL scenarios in the directory are run. If set to '-', then read from stdin.",
)
parser.add_argument(
"-r",
"--repeat",
type=int,
help="The number of repetitions to run for each scenario (default: 1).",
default=1,
)
parser.add_argument(
"-s",
"--subsample",
type=str,
help='Run on a subsample of the tasks in the JSONL file(s). If a decimal value is specified, then run on the given proportion of tasks in each file. For example "0.7" would run on 70%% of tasks, and "1.0" would run on 100%% of tasks. If an integer value is specified, then randomly select *that* number of tasks from each specified JSONL file. For example "7" would run 7 tasks, while "1" would run only 1 task from each specified JSONL file. (default: 1.0; which is 100%%)',
default=None,
)
parser.add_argument(
"-d",
"--docker-image",
type=str,
help="The Docker image to use when running scenarios. Can not be used together with --native. (default: '"
+ DEFAULT_DOCKER_IMAGE_TAG
+ "', which will be created if not present)",
default=None,
)
parser.add_argument(
"--native",
action="store_true",
help="Run the scenarios natively rather than in docker. NOTE: This is not advisable, and should be done with great caution.",
)
parsed_args = parser.parse_args(args)
# Don't allow both --docker-image and --native on the same command
if parsed_args.docker_image is not None and parsed_args.native:
sys.exit("The options --native and --docker-image can not be used together. Exiting.")
# Warn if running natively
if parsed_args.native:
if IS_WIN32:
sys.exit("Running scenarios with --native is not supported in Windows. Exiting.")
# This parser does not define a --requirements option; guard the legacy check so --native does not raise an AttributeError.
if getattr(parsed_args, "requirements", None) is not None:
sys.exit("--requirements is not compatible with --native. Exiting.")
sys.stderr.write(
"WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
)
# Does an environment variable override the prompt?
allow_native = os.environ.get("AUTOGENBENCH_ALLOW_NATIVE")
if allow_native is None or allow_native == "":
choice = input(
'Are you absolutely sure you want to continue with native execution? Type "Yes" exactly, and in full, to proceed: '
)
if choice.strip().lower() != "yes":
sys.exit("Received '" + choice + "'. Exiting.")
elif allow_native.strip().lower() != "yes":
sys.exit(f"Exiting because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
else:
sys.stderr.write(f"Continuing because AUTOGENBENCH_ALLOW_NATIVE is '{allow_native}'\n")
time.sleep(0.75) # Pause very briefly so the message isn't lost in the noise
# Parse the subsample
subsample = None
if parsed_args.subsample is not None:
subsample = float(parsed_args.subsample)
if "." in parsed_args.subsample: # Intention is to run on a proportion
if subsample == 1.0: # Intention is to run 100%, which is the default
subsample = None  # None means 100%; we use None to differentiate it from the integer 1
elif subsample < 0 or subsample > 1.0:
raise (
ValueError(
"Subsample must either be an integer (specified without a decimal), or a Real number between 0.0 and 1.0"
)
)
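# e.g. --subsample 0.1 runs on roughly 10% of the tasks in each file, while --subsample 3 randomly selects 3 tasks per file.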
# Get the Azure bearer token generator if a token wasn't provided and there's any evidence of using Azure
azure_token_provider = None
if not os.environ.get("AZURE_OPENAI_AD_TOKEN") and os.path.isdir(pathlib.Path("~/.azure").expanduser()):
logging.disable(logging.CRITICAL)
try:
azure_token_provider = get_bearer_token_provider(
DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)
azure_token_provider() # Call it once to warm it up, and make sure it doesn't throw an error
print("Found Azure token provider.")
except ClientAuthenticationError:
error_message = traceback.format_exc()
azure_token_provider = None
print(
f"Azure token provider failed loading. Try using 'az login --use-device-code':\n{error_message}\n\nContinuing without Azure token provider..."
)
logging.disable(logging.NOTSET)
# Run the scenario
run_scenarios(
scenario=parsed_args.scenario,
n_repeats=parsed_args.repeat,
is_native=True if parsed_args.native else False,
token_provider=azure_token_provider,
docker_image=parsed_args.docker_image,
subsample=subsample,
)

View File

@ -0,0 +1,236 @@
import argparse
import os
import sys
from copy import deepcopy
from typing import Any, Callable, List, Optional, Sequence, Tuple
import tabulate as tb
from .load_module import load_module
# Figure out where everything is
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
TABULATE_FILE = "custom_tabulate.py"
SUCCESS_STRINGS = [
"ALL TESTS PASSED !#!#",
]
EXCLUDE_DIR_NAMES = ["__pycache__"]
def find_tabulate_module(search_dir: str, stop_dir: Optional[str] = None) -> Optional[str]:
"""Hunt for the tabulate script."""
search_dir = os.path.abspath(search_dir)
if not os.path.isdir(search_dir):
raise ValueError(f"'{search_dir}' is not a directory.")
stop_dir = None if stop_dir is None else os.path.abspath(stop_dir)
while True:
path = os.path.join(search_dir, TABULATE_FILE)
if os.path.isfile(path):
return path
path = os.path.join(search_dir, "Scripts", TABULATE_FILE)
if os.path.isfile(path):
return path
path = os.path.join(search_dir, "scripts", TABULATE_FILE)
if os.path.isfile(path):
return path
# Stop if we hit the stop_dir
if search_dir == stop_dir:
break
# Stop if we hit the root
parent_dir = os.path.abspath(os.path.join(search_dir, os.pardir))
if parent_dir == search_dir:
break
search_dir = parent_dir
return None
def default_scorer(instance_dir: str, success_strings: List[str] = SUCCESS_STRINGS) -> Optional[bool]:
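"""Score a single instance directory by scanning its console_log.txt: True if any success string appears, False if the log exists without one, None if no log was written."""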
console_log = os.path.join(instance_dir, "console_log.txt")
if os.path.isfile(console_log):
with open(console_log, "rt") as fh:
content = fh.read()
for s in success_strings:
if s in content:
return True
return False
else:
return None
ScorerFunc = Callable[[str], Optional[bool]]
def default_tabulate(
args: List[str], scorer: ScorerFunc = default_scorer, exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES
) -> Tuple[argparse.Namespace, List[List[Any]]]:
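"""Score every task/instance directory under runlogs with `scorer` and print the results as a table or CSV, returning the parsed arguments and the raw results."""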
invocation_cmd = args[0]
args = args[1:]
warning = f"CAUTION: '{invocation_cmd}' is in early preview and is not thoroughly tested.\nPlease do not cite values from these calculations in academic work without first inspecting and verifying the results in the run logs yourself."
# Prepare the argument parser
parser = argparse.ArgumentParser(
prog=invocation_cmd,
description=f"{invocation_cmd} will tabulate the results of a previous run.",
)
parser.add_argument(
"runlogs",
help="The path where the run's logs are stored.",
)
parser.add_argument(
"-c",
"--csv",
action="store_true",
help="Output the results in CSV format.",
)
parser.add_argument(
"-e", "--excel", help="Output the results in Excel format. Please specify a path for the Excel file.", type=str
)
parsed_args = parser.parse_args(args)
runlogs: str = parsed_args.runlogs
all_results: List[List[Any]] = list()
max_instances = 0
for task_id in sorted(
os.listdir(runlogs),
key=lambda s: os.path.getmtime(os.path.join(runlogs, s)),
):
if task_id in exclude_dir_names:
continue
task_path = os.path.join(runlogs, task_id)
if not os.path.isdir(task_path):
continue
# Collect the results vector
results: List[Any] = [task_id]
instance = 0
instance_dir = os.path.join(task_path, str(instance))
while os.path.isdir(instance_dir):
results.append(scorer(instance_dir))
instance += 1
instance_dir = os.path.join(task_path, str(instance))
max_instances = max(max_instances, instance)
# Buffer the results
all_results.append(results)
if parsed_args.csv:
# Create a header
header = ["Task Id"]
for i in range(0, max_instances):
header.append("Trial " + str(i) + " Success")
print(",".join(header))
for row in all_results:
str_row = [f"{v}" if v is not None else "" for v in row]
while len(str_row) < max_instances + 1:
str_row.append("")
print(",".join(str_row))
# Print out alpha-version warning
sys.stderr.write("\n" + warning + "\n\n")
else:
# Create a header
header = ["\nTask Id"]
for i in range(0, max_instances):
header.append("Trial " + str(i) + "\nSuccess")
# Create the footer
def _count_equals(value: Optional[bool], trial: int) -> int:
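# Count how many rows have the given value for this trial; value=None counts missing/unscored results.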
count = 0
for row in all_results:
# Guard against rows that recorded fewer trials than max_instances
row_value = row[trial + 1] if trial + 1 < len(row) else None
is_answer_matched = row_value[0] if isinstance(row_value, tuple) else row_value
# Count missing
if value is None:
if trial + 1 < len(row):
if is_answer_matched is None:
count += 1
else:
count += 1
# Count match
elif trial + 1 < len(row) and is_answer_matched == value:
count += 1
return count
footer: List[Any] = []
footer_row: List[Any] = ["Successes"]
for i in range(0, max_instances):
footer_row.append(_count_equals(True, i))
footer.append(footer_row)
footer_row = ["Failures"]
for i in range(0, max_instances):
footer_row.append(_count_equals(False, i))
footer.append(footer_row)
footer_row = ["Missing"]
for i in range(0, max_instances):
footer_row.append(_count_equals(None, i))
footer.append(footer_row)
footer_row = ["Total"]
for i in range(0, max_instances):
footer_row.append(footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1])
footer.append(footer_row)
table = deepcopy(all_results)
for row in table:
for trial in range(0, max_instances):
if isinstance(row[trial + 1], tuple):
row[trial + 1] = row[trial + 1][0]
table.append(tb.SEPARATING_LINE) # type: ignore
table.extend(footer)
print(tb.tabulate(table, headers=header))
# Print out alpha-version warning
sys.stderr.write("\n" + warning + "\n\n")
return parsed_args, all_results
def tabulate_cli(args: Sequence[str]) -> None:
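"""Command-line entry point for tabulating results: look for a scenario-specific custom_tabulate.py (in the current directory or by searching upward from any bare argument), falling back to default_tabulate() when none is found."""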
invocation_cmd = args[0]
args = args[1:]
# We won't assume much about the arguments, letting the dynamically-loaded
# tabulate modules parse them however they want. But we will use the bare
# arguments (those not starting with a "-") to help us find which module to load.
module_path = find_tabulate_module(os.getcwd(), stop_dir=os.getcwd())
for arg in reversed(args):
if module_path is not None:
break
if arg.startswith("-"):
continue
module_path = find_tabulate_module(arg)
# Load the module and hand over control
if module_path is None:
sys.stderr.write("Using default tabulation method.\n\n")
default_tabulate([invocation_cmd] + list(args))
else:
sys.stderr.write(f"Using tabulation method defined in '{module_path}'\n\n")
load_module(module_path).main([invocation_cmd] + list(args))

View File

@ -0,0 +1 @@
# Global finalize.

View File

@ -0,0 +1 @@
echo AUTOGEN_TESTBED_SETTING: [$AUTOGEN_TESTBED_SETTING]

View File

@ -0,0 +1 @@
__version__ = "0.0.1a1"