mirror of
https://github.com/microsoft/autogen.git
synced 2025-07-03 23:19:33 +00:00

## Why are these changes needed?

This PR introduces a baseline self-debugging loop to the `CodeExecutorAgent`. The loop automatically retries code generation and execution up to a configurable number of attempts (n) until the execution succeeds or the retry limit is reached. This enables the agent to recover from transient failures (e.g., syntax errors, runtime errors) by using its own reasoning to iteratively improve generated code—laying the foundation for more robust autonomous behavior.

## Related issue number

Closes #6207

## Checks

- [x] I've included any doc changes needed for <https://microsoft.github.io/autogen/>. See <https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to build and test documentation locally.
- [x] I've added tests (if relevant) corresponding to the changes introduced in this PR.
- [x] I've made sure all auto checks have passed.

---------

Signed-off-by: Abhijeetsingh Meena <abhijeet040403@gmail.com>
Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>
433 lines
15 KiB
Python
433 lines
15 KiB
Python
import pytest
|
|
from autogen_agentchat.agents import CodeExecutorAgent
|
|
from autogen_agentchat.base import Response
|
|
from autogen_agentchat.messages import (
|
|
CodeExecutionEvent,
|
|
CodeGenerationEvent,
|
|
TextMessage,
|
|
)
|
|
from autogen_core import CancellationToken
|
|
from autogen_core.models import ModelFamily, ModelInfo
|
|
from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
|
|
from autogen_ext.models.replay import ReplayChatCompletionClient
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_basic_code_execution() -> None:
    """Send one fenced Python block to the agent and check the printed output comes back."""

    executor_agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())

    # Markdown-fenced script that prints the square root of 42 with three decimals.
    script = "\n".join(
        [
            "```python",
            "import math",
            "",
            "number = 42",
            "square_root = math.sqrt(number)",
            'print("%0.3f" % (square_root,))',
            "```",
        ]
    )
    request = TextMessage(content=script, source="assistant")

    response = await executor_agent.on_messages([request], CancellationToken())

    assert isinstance(response, Response)
    reply = response.chat_message
    assert isinstance(reply, TextMessage)
    assert reply.content.strip() == "6.481"
    assert reply.source == "code_executor"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_code_generation_and_execution_with_model_client() -> None:
    """
    Tests that an agent backed by a model client streams a code generation
    event, a code execution event and a final response.

    Only these three item types are accepted from the stream; anything else
    fails the test. The relative order of the items is not checked.
    """

    expected_language = "python"
    expected_code = 'import math\n\nnumber = 42\nsquare_root = math.sqrt(number)\nprint("%0.3f" % (square_root,))'

    # Replayed model replies; the first one carries the fenced code block.
    scripted_replies = [
        f"Here is the code to calculate the square root of 42:\n```{expected_language}\n{expected_code}```".strip(),
        "TERMINATE",
    ]
    replay_client = ReplayChatCompletionClient(scripted_replies)

    agent = CodeExecutorAgent(
        name="code_executor_agent",
        code_executor=LocalCommandLineCodeExecutor(),
        model_client=replay_client,
    )

    prompt = TextMessage(
        content="Generate python code to calculate the square root of 42",
        source="assistant",
    )

    seen_generation: CodeGenerationEvent | None = None
    seen_execution: CodeExecutionEvent | None = None
    final_response: Response | None = None

    async for item in agent.on_messages_stream([prompt], CancellationToken()):
        if isinstance(item, CodeGenerationEvent):
            first_block = item.code_blocks[0]
            assert first_block.code == expected_code, "Code block does not match"
            assert first_block.language == expected_language, "Language does not match"
            seen_generation = item
            continue

        if isinstance(item, CodeExecutionEvent):
            output = item.to_text().strip()
            assert output == "6.481", f"Expected '6.481', got: {output}"
            seen_execution = item
            continue

        if not isinstance(item, Response):
            raise AssertionError(f"Unexpected message type: {type(item)}")

        reply = item.chat_message
        assert isinstance(reply, TextMessage), f"Expected TextMessage, got: {type(reply)}"
        assert reply.source == "code_executor_agent", f"Expected source 'code_executor_agent', got: {reply.source}"
        final_response = item

    assert seen_generation is not None, "Code generation event was not received"
    assert seen_execution is not None, "Code execution event was not received"
    assert final_response is not None, "Response was not received"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_no_code_response_with_model_client() -> None:
    """
    Tests that a plain-text model reply (no code block) is returned as the
    agent's response, and that nothing other than a Response is streamed.
    """

    replay_client = ReplayChatCompletionClient(["The capital of France is Paris.", "TERMINATE"])

    agent = CodeExecutorAgent(
        name="code_executor_agent",
        code_executor=LocalCommandLineCodeExecutor(),
        model_client=replay_client,
    )

    question = TextMessage(
        content="What is the capital of France?",
        source="assistant",
    )

    final_response: Response | None = None

    async for item in agent.on_messages_stream([question], CancellationToken()):
        # Any event besides the final Response means the agent tried to do more than answer.
        if not isinstance(item, Response):
            raise AssertionError(f"Unexpected message type: {type(item)}")

        reply = item.chat_message
        assert isinstance(reply, TextMessage), f"Expected TextMessage, got: {type(reply)}"
        assert reply.source == "code_executor_agent", f"Expected source 'code_executor_agent', got: {reply.source}"

        content = reply.content.strip()
        assert (
            content == "The capital of France is Paris."
        ), f"Expected 'The capital of France is Paris.', got: {content}"
        final_response = item

    assert final_response is not None, "Response was not received"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_self_debugging_loop() -> None:
    """
    Tests self debugging loop when the model client responds with incorrect code.

    The replayed model replies are, in order: code with an unclosed
    parenthesis, a JSON retry decision, corrected code, and two trailing
    replies. With ``max_retries_on_error=1`` the stream is expected to yield
    exactly six items in this order: code generation, failed execution,
    retry decision, code generation, successful execution, final response.
    Any item of a different type, or in a different position, fails the test.
    """

    def _strip_each_line(text: str) -> str:
        # A syntax-error report echoes the offending line and a caret with
        # leading whitespace; compare on line content only so the check does
        # not depend on that column layout.
        return "\n".join(line.strip() for line in text.strip().splitlines())

    language = "python"
    incorrect_code_block = """
numbers = [10, 20, 30, 40, 50]
mean = sum(numbers) / len(numbers
print("The mean is:", mean)
""".strip()
    incorrect_code_result = """
mean = sum(numbers) / len(numbers
^
SyntaxError: '(' was never closed
""".strip()
    correct_code_block = """
numbers = [10, 20, 30, 40, 50]
mean = sum(numbers) / len(numbers)
print("The mean is:", mean)
""".strip()
    correct_code_result = """
The mean is: 30.0
""".strip()

    model_client = ReplayChatCompletionClient(
        [
            f"""
Here is the code to calculate the mean of 10, 20, 30, 40, 50

```{language}
{incorrect_code_block}
```
""",
            """{"retry": "true", "reason": "Retry 1: It is a test environment"}""",
            f"""
Here is the updated code to calculate the mean of 10, 20, 30, 40, 50

```{language}
{correct_code_block}
```""",
            "Final Response",
            "TERMINATE",
        ],
        model_info=ModelInfo(
            vision=False,
            function_calling=False,
            json_output=True,
            family=ModelFamily.UNKNOWN,
            structured_output=True,
        ),
    )

    agent = CodeExecutorAgent(
        name="code_executor_agent",
        code_executor=LocalCommandLineCodeExecutor(),
        model_client=model_client,
        max_retries_on_error=1,
    )

    messages = [
        TextMessage(
            content="Calculate the mean of 10, 20, 30, 40, 50.",
            source="assistant",
        )
    ]

    incorrect_code_generation_event: CodeGenerationEvent | None = None
    correct_code_generation_event: CodeGenerationEvent | None = None
    retry_decision_event: CodeGenerationEvent | None = None
    incorrect_code_execution_event: CodeExecutionEvent | None = None
    correct_code_execution_event: CodeExecutionEvent | None = None
    response: Response | None = None

    # Position of the current item in the stream; each branch below is tied
    # to one position so that ordering is verified as well as type.
    message_id: int = 0
    async for message in agent.on_messages_stream(messages, CancellationToken()):
        if isinstance(message, CodeGenerationEvent) and message_id == 0:
            # Step 1: First code generation
            code_block = message.code_blocks[0]
            assert code_block.code.strip() == incorrect_code_block, "Incorrect code block does not match"
            assert code_block.language == language, "Language does not match"
            incorrect_code_generation_event = message

        elif isinstance(message, CodeExecutionEvent) and message_id == 1:
            # Step 2: First code execution
            actual_output = message.to_text()
            assert _strip_each_line(incorrect_code_result) in _strip_each_line(
                actual_output
            ), f"Expected {incorrect_code_result} in execution result, got: {actual_output.strip()}"
            incorrect_code_execution_event = message

        elif isinstance(message, CodeGenerationEvent) and message_id == 2:
            # Step 3: Retry generation with proposed correction
            retry_response = "Attempt number: 1\nProposed correction: Retry 1: It is a test environment"
            assert (
                message.to_text().strip() == retry_response
            ), f"Expected {retry_response}, got: {message.to_text().strip()}"
            retry_decision_event = message

        elif isinstance(message, CodeGenerationEvent) and message_id == 3:
            # Step 4: Second retry code generation
            code_block = message.code_blocks[0]
            assert code_block.code.strip() == correct_code_block, "Correct code block does not match"
            assert code_block.language == language, "Language does not match"
            correct_code_generation_event = message

        elif isinstance(message, CodeExecutionEvent) and message_id == 4:
            # Step 5: Second retry code execution
            assert (
                message.to_text().strip() == correct_code_result
            ), f"Expected {correct_code_result} in execution result, got: {message.to_text().strip()}"
            correct_code_execution_event = message

        elif isinstance(message, Response) and message_id == 5:
            # Step 6: Final response
            assert isinstance(
                message.chat_message, TextMessage
            ), f"Expected TextMessage, got: {type(message.chat_message)}"
            assert (
                message.chat_message.source == "code_executor_agent"
            ), f"Expected source 'code_executor_agent', got: {message.chat_message.source}"
            response = message

        else:
            # Reached for an unknown type and also for a known type arriving
            # out of order, so report the position along with the type.
            raise AssertionError(f"Unexpected message at position {message_id}: {type(message)}")

        message_id += 1

    assert incorrect_code_generation_event is not None, "Incorrect code generation event was not received"
    assert incorrect_code_execution_event is not None, "Incorrect code execution event was not received"
    assert retry_decision_event is not None, "Retry decision event was not received"
    assert correct_code_generation_event is not None, "Correct code generation event was not received"
    assert correct_code_execution_event is not None, "Correct code execution event was not received"
    assert response is not None, "Response was not received"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_code_execution_error() -> None:
    """Run a script that raises at runtime and check the error is reported in the response."""

    executor_agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())

    # math.sqrt of a negative number raises ValueError inside the executed script.
    failing_script = "\n".join(
        [
            "```python",
            "import math",
            "",
            "number = -1.0",
            "square_root = math.sqrt(number)",
            'print("%0.3f" % (square_root,))',
            "```",
        ]
    )
    request = TextMessage(content=failing_script, source="assistant")

    response = await executor_agent.on_messages([request], CancellationToken())

    assert isinstance(response, Response)
    reply = response.chat_message
    assert isinstance(reply, TextMessage)
    assert "The script ran, then exited with an error (POSIX exit code: 1)" in reply.content
    assert "ValueError: math domain error" in reply.content
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_code_execution_no_output() -> None:
    """Run a script that prints nothing and check the response explains that no output was produced."""

    executor_agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())

    # The script computes a value but never prints it.
    silent_script = "\n".join(
        [
            "```python",
            "import math",
            "",
            "number = 42",
            "square_root = math.sqrt(number)",
            "```",
        ]
    )
    request = TextMessage(content=silent_script, source="assistant")

    response = await executor_agent.on_messages([request], CancellationToken())

    expected_notice = (
        "The script ran but produced no output to console. The POSIX exit code was: 0. "
        "If you were expecting output, consider revising the script to ensure content is printed to stdout."
    )

    assert isinstance(response, Response)
    reply = response.chat_message
    assert isinstance(reply, TextMessage)
    assert expected_notice in reply.content
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_code_execution_no_block() -> None:
    """Send code that is not wrapped in a markdown fence and check the agent asks for a code block."""

    executor_agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())

    # Plain Python text with no ``` fence around it.
    unfenced_code = "\n".join(
        [
            "import math",
            "",
            "number = 42",
            "square_root = math.sqrt(number)",
        ]
    )
    request = TextMessage(content=unfenced_code, source="assistant")

    response = await executor_agent.on_messages([request], CancellationToken())

    expected_notice = "No code blocks found in the thread. Please provide at least one markdown-encoded code block"

    assert isinstance(response, Response)
    reply = response.chat_message
    assert isinstance(reply, TextMessage)
    assert expected_notice in reply.content
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_code_execution_multiple_blocks() -> None:
    """Send three fenced blocks in one message (two succeed, the last fails) and check all results are reported."""

    executor_agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())

    # Three fenced blocks separated by prose; the last one raises ValueError.
    multi_block_text = "\n".join(
        [
            "```python",
            "import math",
            "",
            "number = 42",
            "square_root = math.sqrt(number)",
            'print("%0.3f" % (square_root,))',
            "```",
            "",
            "And also:",
            "",
            "```python",
            "import time",
            'print(f"The current time is: {time.time()}")',
            "",
            "```",
            "",
            "And this should result in an error:",
            "```python",
            "import math",
            "",
            "number = -1.0",
            "square_root = math.sqrt(number)",
            'print("%0.3f" % (square_root,))',
            "```",
        ]
    )
    request = TextMessage(content=multi_block_text, source="assistant")

    response = await executor_agent.on_messages([request], CancellationToken())

    assert isinstance(response, Response)
    reply = response.chat_message
    assert isinstance(reply, TextMessage)
    assert "6.481" in reply.content
    assert "The current time is:" in reply.content
    assert "The script ran, then exited with an error (POSIX exit code: 1)" in reply.content
    assert "ValueError: math domain error" in reply.content
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_code_execution_agent_serialization() -> None:
    """Dump an agent without a model client to a component config and load it back."""

    original = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())

    # Round-trip through the component config.
    restored = CodeExecutorAgent.load_component(original.dump_component())

    assert isinstance(restored, CodeExecutorAgent)
    assert restored.name == "code_executor"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_code_execution_agent_serialization_with_model_client() -> None:
    """Dump an agent that has a model client to a component config and check the client survives loading."""

    replay_client = ReplayChatCompletionClient(["The capital of France is Paris.", "TERMINATE"])

    original = CodeExecutorAgent(
        name="code_executor_agent",
        code_executor=LocalCommandLineCodeExecutor(),
        model_client=replay_client,
    )

    # Round-trip through the component config.
    restored = CodeExecutorAgent.load_component(original.dump_component())

    assert isinstance(restored, CodeExecutorAgent)
    assert restored.name == "code_executor_agent"
    assert restored._model_client is not None  # type: ignore
|