autogen/test/agentchat/contrib/agent_eval/test_agent_eval.py
James Woffinden-Luey dad9c66104
Agenteval integration (#2672)
* first pass at offline agent eval integration

* Integrating AgentEval for offline scenarios

* removing old changes

* fixing notebook, updating docs

* fixing subcriteria bug

* updating class comment

* cleaning up agent constructors

* moving AgentEval agents to separate folder and adding a brief README

* fixing build breaks

* fixing formatting break

* fixing comments

* consolidating files in the agenteval folder under contrib and cleaning up imports

* fixing import ordering

* adding basic agenteval tests and fixing criteria parsing bug

* first try at adding openai agenteval tests to build process

* adding non-openai agenteval tests to build process

* updating test settings

* updating openai test

* Update test/agentchat/contrib/agent_eval/test_agent_eval.py

Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>

* Update .github/workflows/contrib-openai.yml

Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>

* test commit

* updating typing and converting to pydantic objects

* fixing test file

---------

Co-authored-by: Beibin Li <BeibinLi@users.noreply.github.com>
Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Wael Karkoub <wael.karkoub96@gmail.com>
2024-05-14 07:14:37 +00:00


#!/usr/bin/env python3 -m pytest
import json

import pytest
from conftest import reason, skip_openai  # noqa: E402

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.task import Task

KEY_LOC = "notebook"
OAI_CONFIG_LIST = "OAI_CONFIG_LIST"


def remove_ground_truth(test_case: str):
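    """Strip the ground-truth fields from a serialized test case.

    Returns a tuple of (test case string without ground truth, is_correct flag).
    """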
    test_details = json.loads(test_case)
    # need to remove the ground truth from the test details
    correctness = test_details.pop("is_correct", None)
    test_details.pop("correct_ans", None)
    test_details.pop("check_result", None)
    return str(test_details), correctness
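

# The fixtures below (model config lists, sample responses, and the task definition)
# are only constructed when the OpenAI-dependent tests are enabled.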
if not skip_openai:
    openai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        # The Retrieval tool requires at least gpt-3.5-turbo-1106 (newer versions are supported) or gpt-4-turbo-preview models.
        # https://platform.openai.com/docs/models/overview
        filter_dict={
            "api_type": ["openai"],
            "model": [
                "gpt-4-turbo",
                "gpt-4-turbo-preview",
                "gpt-4-0125-preview",
                "gpt-4-1106-preview",
                "gpt-3.5-turbo",
                "gpt-3.5-turbo-0125",
                "gpt-3.5-turbo-1106",
            ],
        },
    )

    aoai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        filter_dict={"api_type": ["azure"]},
    )

    success_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt", "r").read()
    response_successful = remove_ground_truth(success_str)[0]
    failed_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt", "r").read()
    response_failed = remove_ground_truth(failed_str)[0]
    task = Task(
        **{
            "name": "Math problem solving",
            "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
            "successful_response": response_successful,
            "failed_response": response_failed,
        }
    )


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_generate_criteria():
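    """Check that generate_criteria returns a non-empty list of criteria,
    each with a name, description, and accepted values."""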
    criteria = generate_criteria(task=task, llm_config={"config_list": aoai_config_list})
    assert criteria
    assert len(criteria) > 0
    assert criteria[0].description
    assert criteria[0].name
    assert criteria[0].accepted_values


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_quantify_criteria():
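    """Check that quantify_criteria returns a result containing the actual
    success flag and an estimated performance for the sample test case."""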
    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
    criteria = open(criteria_file, "r").read()
    criteria = Criterion.parse_json_str(criteria)
    test_case = open("test/test_files/agenteval-in-out/samples/sample_test_case.json", "r").read()
    test_case, ground_truth = remove_ground_truth(test_case)
    quantified = quantify_criteria(
        llm_config={"config_list": aoai_config_list},
        criteria=criteria,
        task=task,
        test_case=test_case,
        ground_truth=ground_truth,
    )
    assert quantified
    assert quantified["actual_success"]
    assert quantified["estimated_performance"]