mirror of
https://github.com/microsoft/autogen.git
synced 2025-08-31 03:49:56 +00:00
Add MagenticOne API and CLI (#4782)
* Add MagenticOne API * Add CodeExecutorAgent to MagenticOne for enhanced task execution * Refactor MagenticOne class to inherit from MagenticOneGroupChat and streamline initialization * Enhance MagenticOne class documentation with detailed usage examples and initialization instructions * Refactor MagenticOne module structure and update import paths * Remove unused imports * Add documentation for MagenticOne module and remove redundant initialization comments * Enhance MagenticOne class with human-in-the-loop mode and update examples * Update MagenticOne class documentation with safety precautions and architecture details * Run poe format * Add blog post reference to MagenticOne class documentation * change default of websurfer use_ocr to false because of refusals * Refactor MagenticOne class to use ChatCompletionClient instead of OpenAIChatCompletionClient * Add client capability validation to MagenticOne initialization * Poe format * Refactor imports in MagenticOne class for clarity and organization * Add stacklevel parameter to warning in client capability validation * Update README to recommend using Magentic-One API for improved integration * Add create_args property to OpenAIChatCompletionClient for better access to initialization arguments * Enhance client capability validation in MagenticOne to ensure compatibility with OpenAI GPT-4o model * Refactor client capability validation in MagenticOne for improved clarity * Update magentic_one.py Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com> * Remove create_args property from OpenAIChatCompletionClient and update validation logic in MagenticOne to directly access _create_args * Refactor documentation in MagenticOne for improved readability and consistency * Refactor client capability validation in MagenticOne to remove unnecessary model check for GPT-4o * Add MagenticOne CLI (#4788) * Add MagenticOne CLI script for task execution with OpenAI GPT-4o integration * Fix argument parsing in MagenticOne CLI 
to require a single task input * Add docstring to main function in MagenticOne CLI for improved usage clarity * Fix example usage in docstring of MagenticOne CLI for correct argument order * Refactor argument parsing in MagenticOne CLI for improved clarity and consistency * Add type hints to run_task function in MagenticOne CLI * Add type hint for main function in MagenticOne CLI * Remove type ignore from main function call in MagenticOne CLI --------- Co-authored-by: Hussein Mozannar <hmozannar@microsoft.com> Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>
This commit is contained in:
parent
150a54c4f5
commit
d2537abbab
@ -45,6 +45,7 @@ python/autogen_ext.agents.web_surfer
|
||||
python/autogen_ext.agents.file_surfer
|
||||
python/autogen_ext.agents.video_surfer
|
||||
python/autogen_ext.agents.video_surfer.tools
|
||||
python/autogen_ext.teams.magentic_one
|
||||
python/autogen_ext.models.openai
|
||||
python/autogen_ext.models.replay
|
||||
python/autogen_ext.tools.langchain
|
||||
|
@ -0,0 +1,8 @@
|
||||
autogen\_ext.teams.magentic\_one
|
||||
=================================
|
||||
|
||||
|
||||
.. automodule:: autogen_ext.teams.magentic_one
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
@ -18,6 +18,8 @@ dependencies = [
|
||||
"autogen-core==0.4.0.dev11",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
m1 = "autogen_ext.teams.magentic_one_cli:main"
|
||||
|
||||
[project.optional-dependencies]
|
||||
langchain = ["langchain_core~= 0.3.3"]
|
||||
@ -59,7 +61,6 @@ dev-dependencies = [
|
||||
"autogen_test_utils"
|
||||
]
|
||||
|
||||
|
||||
[tool.ruff]
|
||||
extend = "../../pyproject.toml"
|
||||
include = ["src/**", "tests/*.py"]
|
||||
|
@ -94,7 +94,7 @@ class MultimodalWebSurfer(BaseChatAgent):
|
||||
start_page (str, optional): The start page for the browser. Defaults to MultimodalWebSurfer.DEFAULT_START_PAGE.
|
||||
animate_actions (bool, optional): Whether to animate actions. Defaults to False.
|
||||
to_save_screenshots (bool, optional): Whether to save screenshots. Defaults to False.
|
||||
use_ocr (bool, optional): Whether to use OCR. Defaults to True.
|
||||
use_ocr (bool, optional): Whether to use OCR. Defaults to False.
|
||||
browser_channel (str, optional): The browser channel. Defaults to None.
|
||||
browser_data_dir (str, optional): The browser data directory. Defaults to None.
|
||||
to_resize_viewport (bool, optional): Whether to resize the viewport. Defaults to True.
|
||||
@ -169,7 +169,7 @@ class MultimodalWebSurfer(BaseChatAgent):
|
||||
start_page: str | None = DEFAULT_START_PAGE,
|
||||
animate_actions: bool = False,
|
||||
to_save_screenshots: bool = False,
|
||||
use_ocr: bool = True,
|
||||
use_ocr: bool = False,
|
||||
browser_channel: str | None = None,
|
||||
browser_data_dir: str | None = None,
|
||||
to_resize_viewport: bool = True,
|
||||
|
@ -0,0 +1,140 @@
|
||||
import warnings
|
||||
from typing import List
|
||||
|
||||
from autogen_agentchat.agents import CodeExecutorAgent, UserProxyAgent
|
||||
from autogen_agentchat.base import ChatAgent
|
||||
from autogen_agentchat.teams import MagenticOneGroupChat
|
||||
from autogen_core.models import ChatCompletionClient
|
||||
|
||||
from autogen_ext.agents.file_surfer import FileSurfer
|
||||
from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
|
||||
from autogen_ext.agents.web_surfer import MultimodalWebSurfer
|
||||
from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
|
||||
from autogen_ext.models.openai._openai_client import BaseOpenAIChatCompletionClient
|
||||
|
||||
|
||||
class MagenticOne(MagenticOneGroupChat):
    """A generalist multi-agent team for solving open-ended web and file-based tasks.

    MagenticOne assembles the standard Magentic-One agent roster -- FileSurfer,
    WebSurfer, Coder, and a code Executor -- under the ``MagenticOneGroupChat``
    orchestrator, which plans the task, delegates subtasks to the agents, tracks
    progress in a ledger, and re-plans when progress stalls. For the science
    behind Magentic-One, see the blog post: `Magentic-One: A Generalist
    Multi-Agent System for Solving Complex Tasks
    <https://www.microsoft.com/en-us/research/articles/magentic-one-a-generalist-multi-agent-system-for-solving-complex-tasks>`_
    and the reference below.

    Args:
        client (ChatCompletionClient): The client used for model interactions.
        hil_mode (bool): Optional; If set to True, adds the UserProxyAgent to the list of agents.

    .. warning::
        Using Magentic-One involves interacting with a digital world designed for
        humans, which carries inherent risks. To minimize these risks, consider the
        following precautions:

        1. **Use Containers**: Run all tasks in docker containers to isolate the agents and prevent direct system attacks.
        2. **Virtual Environment**: Use a virtual environment to run the agents and prevent them from accessing sensitive data.
        3. **Monitor Logs**: Closely monitor logs during and after execution to detect and mitigate risky behavior.
        4. **Human Oversight**: Run the examples with a human in the loop to supervise the agents and prevent unintended consequences.
        5. **Limit Access**: Restrict the agents' access to the internet and other resources to prevent unauthorized actions.
        6. **Safeguard Data**: Ensure that the agents do not have access to sensitive data or resources that could be compromised. Do not share sensitive information with the agents.

        Agents may occasionally attempt risky actions, such as recruiting humans
        for help or accepting cookie agreements without human involvement. Always
        ensure agents are monitored and operate within a controlled environment.
        Magentic-One may also be susceptible to prompt injection attacks from
        webpages.

    Architecture:

        Magentic-One consists of the following agents:

        - Orchestrator: The lead agent responsible for task decomposition and planning, directing other agents, tracking overall progress, and taking corrective actions as needed.
        - WebSurfer: An LLM-based agent proficient in commanding and managing the state of a Chromium-based web browser, reporting on the new state of the page after each action.
        - FileSurfer: An LLM-based agent that commands a markdown-based file preview application to read local files of most types and to navigate folder structures.
        - Coder: An LLM-based agent specialized in writing code, analyzing information collected from other agents, or creating new artifacts.
        - ComputerTerminal: Provides a console shell where the Coder's programs can be executed and new programming libraries can be installed.

    Examples:

        .. code-block:: python

            # Autonomously complete a coding task:
            import asyncio
            from autogen_ext.models.openai import OpenAIChatCompletionClient
            from autogen_ext.teams.magentic_one import MagenticOne
            from autogen_agentchat.ui import Console


            async def example_usage():
                client = OpenAIChatCompletionClient(model="gpt-4o")
                m1 = MagenticOne(client=client)
                task = "Write a Python script to fetch data from an API."
                result = await Console(m1.run_stream(task=task))
                print(result)


            if __name__ == "__main__":
                asyncio.run(example_usage())


        .. code-block:: python

            # Enable human-in-the-loop mode
            import asyncio
            from autogen_ext.models.openai import OpenAIChatCompletionClient
            from autogen_ext.teams.magentic_one import MagenticOne
            from autogen_agentchat.ui import Console


            async def example_usage_hil():
                client = OpenAIChatCompletionClient(model="gpt-4o")
                # to enable human-in-the-loop mode, set hil_mode=True
                m1 = MagenticOne(client=client, hil_mode=True)
                task = "Write a Python script to fetch data from an API."
                result = await Console(m1.run_stream(task=task))
                print(result)


            if __name__ == "__main__":
                asyncio.run(example_usage_hil())

    References:
        .. code-block:: bibtex

            @article{fourney2024magentic,
                title={Magentic-one: A generalist multi-agent system for solving complex tasks},
                author={Fourney, Adam and Bansal, Gagan and Mozannar, Hussein and Tan, Cheng and Salinas, Eduardo and Niedtner, Friederike and Proebsting, Grace and Bassman, Griffin and Gerrits, Jack and Alber, Jacob and others},
                journal={arXiv preprint arXiv:2411.04468},
                year={2024},
                url={https://arxiv.org/abs/2411.04468}
            }
    """

    def __init__(self, client: ChatCompletionClient, hil_mode: bool = False):
        self.client = client
        # Emit (non-fatal) warnings up front if the client looks unsuitable.
        self._validate_client_capabilities(client)

        # Standard Magentic-One roster; the Executor runs the Coder's programs
        # via a local command-line code executor.
        participants: List[ChatAgent] = [
            FileSurfer("FileSurfer", model_client=client),
            MultimodalWebSurfer("WebSurfer", model_client=client),
            MagenticOneCoderAgent("Coder", model_client=client),
            CodeExecutorAgent("Executor", code_executor=LocalCommandLineCodeExecutor()),
        ]
        if hil_mode:
            # Human-in-the-loop: add a proxy so a person can be consulted mid-task.
            participants.append(UserProxyAgent("User"))
        super().__init__(participants, model_client=client)

    def _validate_client_capabilities(self, client: ChatCompletionClient) -> None:
        # Best-effort checks only: warn instead of raising so that
        # non-conforming clients can still be tried by the caller.
        caps = client.capabilities
        needed = ("vision", "function_calling", "json_output")
        if any(not caps.get(feature) for feature in needed):
            warnings.warn(
                "Client capabilities for MagenticOne must include vision, function calling, and json output.",
                stacklevel=2,
            )

        if not isinstance(client, BaseOpenAIChatCompletionClient):
            warnings.warn(
                "MagenticOne performs best with OpenAI GPT-4o model either through OpenAI or Azure OpenAI.",
                stacklevel=2,
            )
|
@ -0,0 +1,46 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from autogen_agentchat.ui import Console
|
||||
|
||||
from autogen_ext.models.openai import OpenAIChatCompletionClient
|
||||
from autogen_ext.teams.magentic_one import MagenticOne
|
||||
|
||||
|
||||
def main() -> None:
    """
    Command-line interface for running a complex task using MagenticOne.

    This script accepts a single task string and an optional flag to disable
    human-in-the-loop mode. It initializes the necessary clients and runs the
    task using the MagenticOne class.

    Arguments:
        task (str): The task to be executed by MagenticOne.
        --no-hil: Optional flag to disable human-in-the-loop mode.

    Example usage:
        python magentic_one_cli.py "example task"
        python magentic_one_cli.py --no-hil "example task"
    """
    parser = argparse.ArgumentParser(
        description=(
            "Run a complex task using MagenticOne.\n\n"
            "For more information, refer to the following paper: https://arxiv.org/abs/2411.04468"
        )
    )
    # A plain positional already makes argparse require exactly one value;
    # nargs=1 would needlessly wrap it in a single-element list.
    parser.add_argument("task", type=str, help="The task to be executed by MagenticOne.")
    parser.add_argument("--no-hil", action="store_true", help="Disable human-in-the-loop mode.")
    args = parser.parse_args()

    async def run_task(task: str, hil_mode: bool) -> None:
        # Construct the model client here so it lives inside the event loop;
        # MagenticOne warns if the client lacks the required capabilities.
        client = OpenAIChatCompletionClient(model="gpt-4o")
        m1 = MagenticOne(client=client, hil_mode=hil_mode)
        await Console(m1.run_stream(task=task))

    # Human-in-the-loop is on by default; --no-hil turns it off.
    asyncio.run(run_task(args.task, not args.no_hil))


if __name__ == "__main__":
    main()
|
@ -1,5 +1,8 @@
|
||||
# Magentic-One
|
||||
|
||||
> [!IMPORTANT]
|
||||
> **Note (December 22nd, 2024):** We recommend using the [Magentic-One API](https://github.com/microsoft/autogen/tree/main/python/packages/autogen-ext/src/autogen_ext/teams/magentic_one.py) as the preferred way to interact with Magentic-One. The API provides a more streamlined and robust interface for integrating Magentic-One into your projects.
|
||||
|
||||
> [!CAUTION]
|
||||
> Using Magentic-One involves interacting with a digital world designed for humans, which carries inherent risks. To minimize these risks, consider the following precautions:
|
||||
>
|
||||
|
Loading…
x
Reference in New Issue
Block a user