autogen/python/packages/autogen-agentchat/tests/test_code_executor_agent.py
Abhijeetsingh Meena aad6caa768
Add self-debugging loop to CodeExecutionAgent (#6306)
## Why are these changes needed?
This PR introduces a baseline self-debugging loop to the
`CodeExecutionAgent`.

The loop automatically retries code generation and execution up to a
configurable number of attempts (n) until the execution succeeds or the
retry limit is reached.

This enables the agent to recover from transient failures (e.g., syntax
errors, runtime errors) by using its own reasoning to iteratively
improve generated code—laying the foundation for more robust autonomous
behavior.

## Related issue number

Closes #6207

## Checks

- [x] I've included any doc changes needed for
<https://microsoft.github.io/autogen/>. See
<https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to
build and test documentation locally.
- [x] I've added tests (if relevant) corresponding to the changes
introduced in this PR.
- [x] I've made sure all auto checks have passed.

---------

Signed-off-by: Abhijeetsingh Meena <abhijeet040403@gmail.com>
Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>
2025-04-22 09:24:05 -07:00

433 lines
15 KiB
Python

import pytest
from autogen_agentchat.agents import CodeExecutorAgent
from autogen_agentchat.base import Response
from autogen_agentchat.messages import (
CodeExecutionEvent,
CodeGenerationEvent,
TextMessage,
)
from autogen_core import CancellationToken
from autogen_core.models import ModelFamily, ModelInfo
from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
from autogen_ext.models.replay import ReplayChatCompletionClient
@pytest.mark.asyncio
async def test_basic_code_execution() -> None:
"""Test basic code execution"""
agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())
messages = [
TextMessage(
content="""
```python
import math
number = 42
square_root = math.sqrt(number)
print("%0.3f" % (square_root,))
```
""".strip(),
source="assistant",
)
]
response = await agent.on_messages(messages, CancellationToken())
assert isinstance(response, Response)
assert isinstance(response.chat_message, TextMessage)
assert response.chat_message.content.strip() == "6.481"
assert response.chat_message.source == "code_executor"
@pytest.mark.asyncio
async def test_code_generation_and_execution_with_model_client() -> None:
"""
Tests the code generation, execution and reflection pipeline using a model client.
"""
language = "python"
code = 'import math\n\nnumber = 42\nsquare_root = math.sqrt(number)\nprint("%0.3f" % (square_root,))'
model_client = ReplayChatCompletionClient(
[f"Here is the code to calculate the square root of 42:\n```{language}\n{code}```".strip(), "TERMINATE"]
)
agent = CodeExecutorAgent(
name="code_executor_agent", code_executor=LocalCommandLineCodeExecutor(), model_client=model_client
)
messages = [
TextMessage(
content="Generate python code to calculate the square root of 42",
source="assistant",
)
]
code_generation_event: CodeGenerationEvent | None = None
code_execution_event: CodeExecutionEvent | None = None
response: Response | None = None
async for message in agent.on_messages_stream(messages, CancellationToken()):
if isinstance(message, CodeGenerationEvent):
code_block = message.code_blocks[0]
assert code_block.code == code, "Code block does not match"
assert code_block.language == language, "Language does not match"
code_generation_event = message
elif isinstance(message, CodeExecutionEvent):
assert message.to_text().strip() == "6.481", f"Expected '6.481', got: {message.to_text().strip()}"
code_execution_event = message
elif isinstance(message, Response):
assert isinstance(
message.chat_message, TextMessage
), f"Expected TextMessage, got: {type(message.chat_message)}"
assert (
message.chat_message.source == "code_executor_agent"
), f"Expected source 'code_executor_agent', got: {message.chat_message.source}"
response = message
else:
raise AssertionError(f"Unexpected message type: {type(message)}")
assert code_generation_event is not None, "Code generation event was not received"
assert code_execution_event is not None, "Code execution event was not received"
assert response is not None, "Response was not received"
@pytest.mark.asyncio
async def test_no_code_response_with_model_client() -> None:
"""
Tests agent behavior when the model client responds with non-code content.
"""
model_client = ReplayChatCompletionClient(["The capital of France is Paris.", "TERMINATE"])
agent = CodeExecutorAgent(
name="code_executor_agent", code_executor=LocalCommandLineCodeExecutor(), model_client=model_client
)
messages = [
TextMessage(
content="What is the capital of France?",
source="assistant",
)
]
response: Response | None = None
async for message in agent.on_messages_stream(messages, CancellationToken()):
if isinstance(message, Response):
assert isinstance(
message.chat_message, TextMessage
), f"Expected TextMessage, got: {type(message.chat_message)}"
assert (
message.chat_message.source == "code_executor_agent"
), f"Expected source 'code_executor_agent', got: {message.chat_message.source}"
assert (
message.chat_message.content.strip() == "The capital of France is Paris."
), f"Expected 'The capital of France is Paris.', got: {message.chat_message.content.strip()}"
response = message
else:
raise AssertionError(f"Unexpected message type: {type(message)}")
assert response is not None, "Response was not received"
@pytest.mark.asyncio
async def test_self_debugging_loop() -> None:
"""
Tests self debugging loop when the model client responds with incorrect code.
"""
language = "python"
incorrect_code_block = """
numbers = [10, 20, 30, 40, 50]
mean = sum(numbers) / len(numbers
print("The mean is:", mean)
""".strip()
incorrect_code_result = """
mean = sum(numbers) / len(numbers
^
SyntaxError: '(' was never closed
""".strip()
correct_code_block = """
numbers = [10, 20, 30, 40, 50]
mean = sum(numbers) / len(numbers)
print("The mean is:", mean)
""".strip()
correct_code_result = """
The mean is: 30.0
""".strip()
model_client = ReplayChatCompletionClient(
[
f"""
Here is the code to calculate the mean of 10, 20, 30, 40, 50
```{language}
{incorrect_code_block}
```
""",
"""{"retry": "true", "reason": "Retry 1: It is a test environment"}""",
f"""
Here is the updated code to calculate the mean of 10, 20, 30, 40, 50
```{language}
{correct_code_block}
```""",
"Final Response",
"TERMINATE",
],
model_info=ModelInfo(
vision=False,
function_calling=False,
json_output=True,
family=ModelFamily.UNKNOWN,
structured_output=True,
),
)
agent = CodeExecutorAgent(
name="code_executor_agent",
code_executor=LocalCommandLineCodeExecutor(),
model_client=model_client,
max_retries_on_error=1,
)
messages = [
TextMessage(
content="Calculate the mean of 10, 20, 30, 40, 50.",
source="assistant",
)
]
incorrect_code_generation_event: CodeGenerationEvent | None = None
correct_code_generation_event: CodeGenerationEvent | None = None
retry_decision_event: CodeGenerationEvent | None = None
incorrect_code_execution_event: CodeExecutionEvent | None = None
correct_code_execution_event: CodeExecutionEvent | None = None
response: Response | None = None
message_id: int = 0
async for message in agent.on_messages_stream(messages, CancellationToken()):
if isinstance(message, CodeGenerationEvent) and message_id == 0:
# Step 1: First code generation
code_block = message.code_blocks[0]
assert code_block.code.strip() == incorrect_code_block, "Incorrect code block does not match"
assert code_block.language == language, "Language does not match"
incorrect_code_generation_event = message
elif isinstance(message, CodeExecutionEvent) and message_id == 1:
# Step 2: First code execution
assert (
incorrect_code_result in message.to_text().strip()
), f"Expected {incorrect_code_result} in execution result, got: {message.to_text().strip()}"
incorrect_code_execution_event = message
elif isinstance(message, CodeGenerationEvent) and message_id == 2:
# Step 3: Retry generation with proposed correction
retry_response = "Attempt number: 1\nProposed correction: Retry 1: It is a test environment"
assert (
message.to_text().strip() == retry_response
), f"Expected {retry_response}, got: {message.to_text().strip()}"
retry_decision_event = message
elif isinstance(message, CodeGenerationEvent) and message_id == 3:
# Step 4: Second retry code generation
code_block = message.code_blocks[0]
assert code_block.code.strip() == correct_code_block, "Correct code block does not match"
assert code_block.language == language, "Language does not match"
correct_code_generation_event = message
elif isinstance(message, CodeExecutionEvent) and message_id == 4:
# Step 5: Second retry code execution
assert (
message.to_text().strip() == correct_code_result
), f"Expected {correct_code_result} in execution result, got: {message.to_text().strip()}"
correct_code_execution_event = message
elif isinstance(message, Response) and message_id == 5:
# Step 6: Final response
assert isinstance(
message.chat_message, TextMessage
), f"Expected TextMessage, got: {type(message.chat_message)}"
assert (
message.chat_message.source == "code_executor_agent"
), f"Expected source 'code_executor_agent', got: {message.chat_message.source}"
response = message
else:
raise AssertionError(f"Unexpected message type: {type(message)}")
message_id += 1
assert incorrect_code_generation_event is not None, "Incorrect code generation event was not received"
assert incorrect_code_execution_event is not None, "Incorrect code execution event was not received"
assert retry_decision_event is not None, "Retry decision event was not received"
assert correct_code_generation_event is not None, "Correct code generation event was not received"
assert correct_code_execution_event is not None, "Correct code execution event was not received"
assert response is not None, "Response was not received"
@pytest.mark.asyncio
async def test_code_execution_error() -> None:
"""Test basic code execution"""
agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())
messages = [
TextMessage(
content="""
```python
import math
number = -1.0
square_root = math.sqrt(number)
print("%0.3f" % (square_root,))
```
""".strip(),
source="assistant",
)
]
response = await agent.on_messages(messages, CancellationToken())
assert isinstance(response, Response)
assert isinstance(response.chat_message, TextMessage)
assert "The script ran, then exited with an error (POSIX exit code: 1)" in response.chat_message.content
assert "ValueError: math domain error" in response.chat_message.content
@pytest.mark.asyncio
async def test_code_execution_no_output() -> None:
"""Test basic code execution"""
agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())
messages = [
TextMessage(
content="""
```python
import math
number = 42
square_root = math.sqrt(number)
```
""".strip(),
source="assistant",
)
]
response = await agent.on_messages(messages, CancellationToken())
assert isinstance(response, Response)
assert isinstance(response.chat_message, TextMessage)
assert (
"The script ran but produced no output to console. The POSIX exit code was: 0. If you were expecting output, consider revising the script to ensure content is printed to stdout."
in response.chat_message.content
)
@pytest.mark.asyncio
async def test_code_execution_no_block() -> None:
"""Test basic code execution"""
agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())
messages = [
TextMessage(
content="""
import math
number = 42
square_root = math.sqrt(number)
""".strip(),
source="assistant",
)
]
response = await agent.on_messages(messages, CancellationToken())
assert isinstance(response, Response)
assert isinstance(response.chat_message, TextMessage)
assert (
"No code blocks found in the thread. Please provide at least one markdown-encoded code block"
in response.chat_message.content
)
@pytest.mark.asyncio
async def test_code_execution_multiple_blocks() -> None:
"""Test basic code execution"""
agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())
messages = [
TextMessage(
content="""
```python
import math
number = 42
square_root = math.sqrt(number)
print("%0.3f" % (square_root,))
```
And also:
```python
import time
print(f"The current time is: {time.time()}")
```
And this should result in an error:
```python
import math
number = -1.0
square_root = math.sqrt(number)
print("%0.3f" % (square_root,))
```
""".strip(),
source="assistant",
)
]
response = await agent.on_messages(messages, CancellationToken())
assert isinstance(response, Response)
assert isinstance(response.chat_message, TextMessage)
assert "6.481" in response.chat_message.content
assert "The current time is:" in response.chat_message.content
assert "The script ran, then exited with an error (POSIX exit code: 1)" in response.chat_message.content
assert "ValueError: math domain error" in response.chat_message.content
@pytest.mark.asyncio
async def test_code_execution_agent_serialization() -> None:
"""Test agent config serialization"""
agent = CodeExecutorAgent(name="code_executor", code_executor=LocalCommandLineCodeExecutor())
# Serialize and deserialize the agent
serialized_agent = agent.dump_component()
deserialized_agent = CodeExecutorAgent.load_component(serialized_agent)
assert isinstance(deserialized_agent, CodeExecutorAgent)
assert deserialized_agent.name == "code_executor"
@pytest.mark.asyncio
async def test_code_execution_agent_serialization_with_model_client() -> None:
"""Test agent config serialization"""
model_client = ReplayChatCompletionClient(["The capital of France is Paris.", "TERMINATE"])
agent = CodeExecutorAgent(
name="code_executor_agent", code_executor=LocalCommandLineCodeExecutor(), model_client=model_client
)
# Serialize and deserialize the agent
serialized_agent = agent.dump_component()
deserialized_agent = CodeExecutorAgent.load_component(serialized_agent)
assert isinstance(deserialized_agent, CodeExecutorAgent)
assert deserialized_agent.name == "code_executor_agent"
assert deserialized_agent._model_client is not None # type: ignore