autogen/test/agentchat/contrib/agent_eval/test_agent_eval.py

#!/usr/bin/env python3 -m pytest
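"""Tests for autogen.agentchat.contrib.agent_eval: criteria generation and criteria quantification."""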

import json

import pytest

from conftest import reason, skip_openai  # noqa: E402

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.task import Task

KEY_LOC = "notebook"
OAI_CONFIG_LIST = "OAI_CONFIG_LIST"


def remove_ground_truth(test_case: str):
    test_details = json.loads(test_case)
    # need to remove the ground truth from the test details
    correctness = test_details.pop("is_correct", None)
    test_details.pop("correct_ans", None)
    test_details.pop("check_result", None)
    return str(test_details), correctness


if not skip_openai:
    openai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        # The Retrieval tool requires at least gpt-3.5-turbo-1106 (newer versions are supported) or gpt-4-turbo-preview models.
        # https://platform.openai.com/docs/models/overview
        filter_dict={
            "api_type": ["openai"],
            "model": [
                "gpt-4-turbo",
                "gpt-4-turbo-preview",
                "gpt-4-0125-preview",
                "gpt-4-1106-preview",
                "gpt-3.5-turbo",
                "gpt-3.5-turbo-0125",
                "gpt-3.5-turbo-1106",
            ],
        },
    )

    aoai_config_list = autogen.config_list_from_json(
        OAI_CONFIG_LIST,
        file_location=KEY_LOC,
        filter_dict={"api_type": ["azure"]},
    )

    success_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt", "r").read()
    response_successful = remove_ground_truth(success_str)[0]
    failed_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt", "r").read()
    response_failed = remove_ground_truth(failed_str)[0]

    task = Task(
        **{
            "name": "Math problem solving",
            "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
            "successful_response": response_successful,
            "failed_response": response_failed,
        }
    )


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_generate_criteria():
    criteria = generate_criteria(task=task, llm_config={"config_list": aoai_config_list})
    assert criteria
    assert len(criteria) > 0
    assert criteria[0].description
    assert criteria[0].name
    assert criteria[0].accepted_values


@pytest.mark.skipif(
    skip_openai,
    reason=reason,
)
def test_quantify_criteria():
    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
    criteria = open(criteria_file, "r").read()
    criteria = Criterion.parse_json_str(criteria)

    test_case = open("test/test_files/agenteval-in-out/samples/sample_test_case.json", "r").read()
    test_case, ground_truth = remove_ground_truth(test_case)

    quantified = quantify_criteria(
        llm_config={"config_list": aoai_config_list},
        criteria=criteria,
        task=task,
        test_case=test_case,
        ground_truth=ground_truth,
    )
    assert quantified
    assert quantified["actual_success"]
    assert quantified["estimated_performance"]