mirror of
https://github.com/microsoft/autogen.git
synced 2025-09-09 08:16:09 +00:00
Adding first version of AgentEval -- a framework for assessing task utility for LLM-powered applications (#681)
* add agenteval-notebook for math problems and the blog post about it * update gitignore * updates to notebook * adding folder for the logs * adding math problems logs * adding folder for alfworld logs * added limitiation and future work to blog post * minor edits blog post * adding changes * reorg * modify the main notebook * modification of the main notebook * remove wrong notebook * uploading new notebook * update agenteval notebook * change the sample * Update agenteval_cq_math.ipynb * adding final changes to notebook * updated framework picture * Update index.mdx * Update index.md * Add files via upload * updates to notebool * revise the blog * revise the blog * update the agent img * revise the blog * revise the blog * Excluded model logs from the main branch, you can find them in agenteval branch * Fixed pre-commit formatting. * Update website/blog/2023-11-11-AgentEval/index.mdx Co-authored-by: Chi Wang <wang.chi@microsoft.com> * update gitignore * update index.mdx * update authors.yml by adding Negar and Julia * remove md file * remove md file * update gitignore * update authors file * pre-commit checks * pre-commit checks on authors.yml * pre-commit checks on authors.yml * update index.mdx * update authors.yml by adding Negar and Julia * updated the blog-post version 1 * updated the blog-post: TL;DR is ready * updated the blog-post: first part of introduction is ready * updated figures: typos on fig 1, changed terminology on the fig 2 * upadated the Framework part * fixed redering issues * upload zip file instead of single samples * update prealgebra.zip * update * upload * update z * update naming * update zip * update the agenteval notebook * update the notebook - removing unmercenary logs * updated fig 1 and references to it * updated fig 1 * incorporated PR comments * merged agenteval branch * final changes to the blog * updated taxonomy * update notebook * minor changes to the blog * Fixed formatting * Update the link in agenteval_cq_math.ipynb * 
update the blog and link in notebook * Update index.mdx * change folder name * Changes to be committed: modified: OAI_CONFIG_LIST_sample.txt * add sample OAI file * fix the url link to colab and typos * fix the url link to colab and typos * add authors * update profile pic * "update authors" * fixing the problem in test_groupchat.py * update the title lower case * reverting changes in setup.py * rerun pre-commit --------- Co-authored-by: Negar Arabzadeh <ngr.arabzadeh@gmail.com> Co-authored-by: Julia Kiseleva <jukisele@microsoft.com> Co-authored-by: afourney <adamfo@microsoft.com> Co-authored-by: Chi Wang <wang.chi@microsoft.com> Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
This commit is contained in:
parent
07646d448c
commit
19c7da2dd1
3
.gitignore
vendored
3
.gitignore
vendored
@ -167,3 +167,6 @@ wolfram.txt
|
||||
|
||||
# DB on disk for TeachableAgent
|
||||
tmp/
|
||||
|
||||
# Storage for the AgentEval output
|
||||
test/test_files/agenteval-in-out/out/
|
||||
|
789
notebook/agenteval_cq_math.ipynb
Normal file
789
notebook/agenteval_cq_math.ipynb
Normal file
File diff suppressed because one or more lines are too long
@ -0,0 +1,26 @@
|
||||
{
|
||||
"Problem Interpretation": {
|
||||
"description": "Ability to correctly interpret the problem.",
|
||||
"accepted_values": ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]
|
||||
},
|
||||
"Mathematical Methodology": {
|
||||
"description": "Adequacy of the chosen mathematical or algorithmic methodology for the question",
|
||||
"accepted_values": ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"]
|
||||
},
|
||||
"Calculation Correctness": {
|
||||
"description": "Accuracy of calculations made and solutions given",
|
||||
"accepted_values": ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"]
|
||||
},
|
||||
"Explanation Clarity": {
|
||||
"description": "Clarity and comprehensibility of explanations, including language use and structure",
|
||||
"accepted_values": ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"]
|
||||
},
|
||||
"Code Efficiency": {
|
||||
"description": "Quality of code in terms of efficiency and elegance",
|
||||
"accepted_values": ["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"]
|
||||
},
|
||||
"Code Correctness": {
|
||||
"description": "Correctness of the provided code",
|
||||
"accepted_values": ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"]
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
{
|
||||
"sample_algebra.json": {
|
||||
"actual_success": "true",
|
||||
"estimated_performance": {
|
||||
"Problem Interpretation": "completely accurate",
|
||||
"Mathematical Methodology": "completely effective",
|
||||
"Calculation Correctness": "completely correct",
|
||||
"Explanation Clarity": "very clear",
|
||||
"Code Efficiency": "very efficient",
|
||||
"Code Correctness": "completely correct"
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
{
|
||||
"problem": "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.",
|
||||
"level": "Level 5",
|
||||
"type": "Algebra",
|
||||
"solution": "We have \\begin{align*} (2x+10)(x+3)&<(3x+9)(x+8) \\quad \\Rightarrow\n\\\\ 2(x+5)(x+3)&<3(x+3)(x+8) \\quad \\Rightarrow\n\\\\ 2(x+5)(x+3)-3(x+3)(x+8)&<0 \\quad \\Rightarrow\n\\\\ (2x+10-(3x+24))(x+3)&<0 \\quad \\Rightarrow\n\\\\ (-x-14)(x+3)&<0 \\quad \\Rightarrow\n\\\\ (x+14)(x+3)&>0.\n\\end{align*} This inequality is satisfied if and only if $(x+14)$ and $(x+3)$ are either both positive or both negative. Both factors are positive for $x>-3$ and both factors are negative for $x<-14$. When $-14<x<-3$, one factor is positive and the other negative, so their product is negative. Therefore, the range of $x$ that satisfies the inequality is $ \\boxed{(-\\infty, -14)\\cup(-3,\\infty)} $.",
|
||||
"correct_ans": "(-\\infty, -14)\\cup(-3,\\infty)",
|
||||
"problem_id": "0",
|
||||
"response_with_ans": "I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\n\n```python\nfrom sympy import symbols, factor\r\nx = symbols('x')\r\nfactorized = factor(x**2 + 17*x + 42)\r\nfactorized\r\n```\n\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.",
|
||||
"round": 1,
|
||||
"messages": [
|
||||
{
|
||||
"content": "Find all $x$ that satisfy the inequality $(2x+10)(x+3)<(3x+9)(x+8)$. Express your answer in interval notation.",
|
||||
"role": "user"
|
||||
},
|
||||
{
|
||||
"content": "First, let's simplify and isolate the inequality to solve for $x$. For this, we need to expand the expressions on both sides and then move everything to one side.\n\nStep 1: Expand the expressions on both sides\n$(2x+10)(x+3) = 2x^2 + 6x + 10x + 30 = 2x^2 + 16x + 30$\n$(3x+9)(x+8) = 3x^2 + 24x + 9x + 72 = 3x^2 + 33x + 72$\n\nStep 2: Move everything to one side\n$0 < 3x^2 + 33x + 72 - (2x^2 + 16x + 30) = x^2 + 17x + 42$\n\nNow we need to find the factorized form of the quadratic inequality $x^2 + 17x + 42 > 0$ and then find intervals where the inequality holds true.\n\nLet's first find the factorized form of the quadratic:",
|
||||
"role": "assistant"
|
||||
},
|
||||
{
|
||||
"content": "",
|
||||
"role": "user"
|
||||
},
|
||||
{
|
||||
"content": "I apologize for the confusion in my previous message. Let's find the factorized form of the quadratic using Python code.\n\n```python\nfrom sympy import symbols, factor\r\nx = symbols('x')\r\nfactorized = factor(x**2 + 17*x + 42)\r\nfactorized\r\n```\n\nPlease run the Python code above to find the factorized form of the quadratic, and we can proceed to determine the intervals where the inequality holds true.",
|
||||
"role": "assistant"
|
||||
}
|
||||
],
|
||||
"time": 24.91333508491516,
|
||||
"check_result": "the reply doesn't contain an answer.",
|
||||
"is_correct": false,
|
||||
"trial": -1
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
{
|
||||
"problem": "What is the sum of all the distinct positive two-digit factors of 144?",
|
||||
"level": "Level 5",
|
||||
"type": "Number Theory",
|
||||
"solution": "Prime factorize $144=2^4\\cdot3^2$. The sum of the positive two-digit factors of 144 is $2^4+2\\cdot3^2+2^2\\cdot3+2^2\\cdot3^2+2^3\\cdot3+2^3\\cdot3^2+2^4\\cdot3=\\boxed{226}.$",
|
||||
"correct_ans": "226",
|
||||
"problem_id": "0",
|
||||
"response_with_ans": "To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\n\nHere's a Python script to accomplish this:\n\n```python\ntwo_digit_factors = []\n\nfor i in range(10, 100):\n if 144 % i == 0:\n two_digit_factors.append(i)\n\nsum_of_factors = sum(two_digit_factors)\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\n```\n\nPlease run this script to find the desired sum.",
|
||||
"round": 0,
|
||||
"messages": [
|
||||
{
|
||||
"content": "What is the sum of all the distinct positive two-digit factors of 144?",
|
||||
"role": "user"
|
||||
},
|
||||
{
|
||||
"content": "To find the sum of all the distinct positive two-digit factors of 144, we need to first find all these factors. We can do this by iterating through the numbers from 10 to 99 and checking if they are factors of 144. Then, we can sum these factors and print their sum.\n\nHere's a Python script to accomplish this:\n\n```python\ntwo_digit_factors = []\n\nfor i in range(10, 100):\n if 144 % i == 0:\n two_digit_factors.append(i)\n\nsum_of_factors = sum(two_digit_factors)\nprint(\"The sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\n```\n\nPlease run this script to find the desired sum.",
|
||||
"role": "assistant"
|
||||
}
|
||||
],
|
||||
"time": 11.140539407730103,
|
||||
"check_result": "the answer is not directly provided in the reply, but i can extract it by running the given python code:\n\n```python\ntwo_digit_factors = []\n\nfor i in range(10, 100):\n if 144 % i == 0:\n two_digit_factors.append(i)\n\nsum_of_factors = sum(two_digit_factors)\nprint(\"the sum of all the distinct positive two-digit factors of 144 is:\", sum_of_factors)\n```\n\nafter running this code, the result is:\nthe sum of all the distinct positive two-digit factors of 144 is: 226\n\nthe answer is 226. the ground truth answer is 226. therefore,\n\nthe answer is correct.",
|
||||
"is_correct": true,
|
||||
"trial": -1
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
{
|
||||
"problem": "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.",
|
||||
"level": "Level 5",
|
||||
"type": "Number Theory",
|
||||
"solution": "Since $5 \\times 24 = 120 = 121 - 1$, it follows that $-5 \\times 24 \\equiv 1 \\pmod{121}$. Adding 121 to $-5$ to make it positive, we find $(-5 + 121) \\times 24 \\equiv 116 \\times 24 \\equiv 1 \\pmod{121}$, so it follows that the modular inverse of $24$ is $\\boxed{116}$ when taken modulo $121$.",
|
||||
"correct_ans": "116",
|
||||
"problem_id": "5",
|
||||
"response_with_ans": "To find the modular inverse of 24 modulo 11^2, we can use the Extended Euclidean Algorithm. Here is a Python function to compute the modular inverse using this algorithm:\n\n```python\ndef mod_inverse(a, m):\n g, x, _ = extended_gcd(a, m)\n if g != 1:\n raise Exception(f\"{a} and {m} are not coprime.\")\n return x % m\n\ndef extended_gcd(a, b):\n if a == 0:\n return b, 0, 1\n else:\n g, x, y = extended_gcd(b % a, a)\n return g, y - (b // a) * x, x\n```\n\nLet's use the above function to find the modular inverse of 24 modulo 11^2:\n\n```python\na = 24\nm = 11**2\nmod_inverse(a, m)\n```\n\nI will execute the above code to find the modular inverse of 24 modulo 11^2.",
|
||||
"round": 0,
|
||||
"messages": [
|
||||
{
|
||||
"content": "Find $24^{-1} \\pmod{11^2}$. That is, find the residue $b$ for which $24b \\equiv 1\\pmod{11^2}$.\n\nExpress your answer as an integer from $0$ to $11^2-1$, inclusive.",
|
||||
"role": "user"
|
||||
},
|
||||
{
|
||||
"content": "To find the modular inverse of 24 modulo 11^2, we can use the Extended Euclidean Algorithm. Here is a Python function to compute the modular inverse using this algorithm:\n\n```python\ndef mod_inverse(a, m):\n g, x, _ = extended_gcd(a, m)\n if g != 1:\n raise Exception(f\"{a} and {m} are not coprime.\")\n return x % m\n\ndef extended_gcd(a, b):\n if a == 0:\n return b, 0, 1\n else:\n g, x, y = extended_gcd(b % a, a)\n return g, y - (b // a) * x, x\n```\n\nLet's use the above function to find the modular inverse of 24 modulo 11^2:\n\n```python\na = 24\nm = 11**2\nmod_inverse(a, m)\n```\n\nI will execute the above code to find the modular inverse of 24 modulo 11^2.",
|
||||
"role": "assistant"
|
||||
}
|
||||
],
|
||||
"time": 13.481226921081543,
|
||||
"check_result": "let me verify the modular inverse of 24 modulo 11^2.\n\nafter executing the provided python code:\n\n```python\na = 24\nm = 11**2\nmod_inverse(a, m)\n```\n\nthe answer is found to be: 116.\n\nthe answer is correct.",
|
||||
"is_correct": true,
|
||||
"trial": -1
|
||||
}
|
BIN
website/blog/2023-11-20-AgentEval/img/agenteval-CQ.png
Normal file
BIN
website/blog/2023-11-20-AgentEval/img/agenteval-CQ.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 171 KiB |
BIN
website/blog/2023-11-20-AgentEval/img/math-problems-plot.png
Normal file
BIN
website/blog/2023-11-20-AgentEval/img/math-problems-plot.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 91 KiB |
BIN
website/blog/2023-11-20-AgentEval/img/tasks-taxonomy.png
Normal file
BIN
website/blog/2023-11-20-AgentEval/img/tasks-taxonomy.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 98 KiB |
151
website/blog/2023-11-20-AgentEval/index.mdx
Normal file
151
website/blog/2023-11-20-AgentEval/index.mdx
Normal file
@ -0,0 +1,151 @@
|
||||
---
|
||||
title: How to Assess Utility of LLM-powered Applications?
|
||||
authors:
|
||||
- julianakiseleva
|
||||
- narabzad
|
||||
tags: [LLM, GPT, evaluation, task utility]
|
||||
---
|
||||
|
||||
|
||||

|
||||
|
||||
<p align="center"><em>Fig.1 illustrates the general flow of AgentEval</em></p>
|
||||
|
||||
**TL;DR:**
|
||||
* As a developer of an LLM-powered application, how can you assess the utility it brings to end users while helping them with their tasks?
|
||||
* To shed light on the question above, we introduce `AgentEval` — the first version of the framework to assess the utility of any LLM-powered application crafted to assist users in specific tasks. AgentEval aims to simplify the evaluation process by automatically proposing a set of criteria tailored to the unique purpose of your application. This allows for a comprehensive assessment, quantifying the utility of your application against the suggested criteria.
|
||||
* We demonstrate how `AgentEval` works using the [math problems dataset](https://microsoft.github.io/autogen/blog/2023/06/28/MathChat) as an example in the [following notebook](https://github.com/microsoft/autogen/blob/main/notebook/agenteval_cq_math.ipynb). Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc).
|
||||
|
||||
|
||||
## Introduction
|
||||
|
||||
AutoGen aims to simplify the development of LLM-powered multi-agent systems for various applications, ultimately making end users' lives easier by assisting with their tasks. Next, we all yearn to understand how our developed systems perform, their utility for users, and, perhaps most crucially, how we can enhance them. Directly evaluating multi-agent systems poses challenges as current approaches predominantly rely on success metrics – essentially, whether the agent accomplishes tasks. However, comprehending user interaction with a system involves far more than success alone. Take math problems, for instance; it's not merely about the agent solving the problem. Equally significant is its ability to convey solutions based on various criteria, including completeness, conciseness, and the clarity of the provided explanation. Furthermore, success isn't always clearly defined for every task.
|
||||
|
||||
Rapid advances in LLMs and multi-agent systems have brought forth many emerging capabilities that we're keen on translating into tangible utilities for end users. We introduce the first version of the `AgentEval` framework - a tool crafted to empower developers in swiftly gauging the utility of LLM-powered applications designed to help end users accomplish the desired task.
|
||||
|
||||
|
||||

|
||||
<p align="center"><em>Fig. 2 provides an overview of the tasks taxonomy</em></p>
|
||||
|
||||
|
||||
Let's first look into an overview of the suggested task taxonomy that a multi-agent system can be designed for. In general, the tasks can be split into two types, where:
|
||||
* _Success is not clearly defined_ - refers to instances when users utilize a system in an assistive manner, seeking suggestions rather than expecting the system to solve the task. For example, a user might request the system to generate an email. In many cases, this generated content serves as a template that the user will later edit. However, defining success precisely for such tasks is relatively complex.
|
||||
* _Success is clearly defined_ - refers to instances where we can clearly define whether a system solved the task or not. Consider agents that assist in accomplishing household tasks, where the definition of success is clear and measurable. This category can be further divided into two separate subcategories:
|
||||
* _The optimal solution exists_ - these are tasks where only one solution is possible. For example, if you ask your assistant to turn on the light, the success of this task is clearly defined, and there is only one way to accomplish it.
|
||||
* _Multiple solutions exist_ - increasingly, we observe situations where multiple trajectories of agent behavior can lead to either success or failure. In such cases, it is crucial to differentiate between the various successful and unsuccessful trajectories. Examples include asking the agent to suggest a food recipe or to tell you a joke.
|
||||
|
||||
In our `AgentEval` framework, we are currently focusing on tasks where _Success is clearly defined_. Next, we will introduce the suggested framework.
|
||||
|
||||
## `AgentEval` Framework
|
||||
|
||||
Our previous research on [assistive agents in Minecraft](https://github.com/microsoft/iglu-datasets) suggested that the most optimal way to obtain human judgments is to present humans with two agents side by side and ask for preferences. In this setup of pairwise comparison, humans can develop criteria to explain why they prefer the behavior of one agent over another. For instance, _'the first agent was faster in execution,'_ or _'the second agent moves more naturally.'_ So, the comparative nature led humans to come up with a list of criteria that helps to infer the utility of the task. With this idea in mind, we designed `AgentEval` (shown in Fig. 1), where we employ LLMs to help us understand, verify, and assess task *utility* for the multi-agent system. Namely:
|
||||
|
||||
* The goal of `CriticAgent` is to suggest the list of criteria (Fig. 1), that can be used to assess task utility. This is an example of how `CriticAgent` is defined using `Autogen`:
|
||||
|
||||
```python
|
||||
critic = autogen.AssistantAgent(
|
||||
name="critic",
|
||||
llm_config={"config_list": config_list},
|
||||
system_message="""You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable, and not redundant.
|
||||
Convert the evaluation criteria into a dictionary where the keys are the criteria.
|
||||
The value of each key is a dictionary as follows {"description": criteria description, "accepted_values": possible accepted inputs for this key}
|
||||
Make sure the keys are criteria for assessing the given task. "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description.
|
||||
Return only the dictionary."""
|
||||
)
|
||||
```
|
||||
|
||||
Next, the critic is given successful and failed examples of the task execution; then, it is able to return a list of criteria (Fig. 1). For reference, use the [following notebook](https://github.com/microsoft/autogen/blob/main/notebook/agenteval_cq_math.ipynb).
|
||||
|
||||
* The goal of `QuantifierAgent` is to quantify each of the suggested criteria (Fig. 1), providing us with an idea of the utility of this system for the given task. Here is an example of how it can be defined:
|
||||
|
||||
```python
|
||||
quantifier = autogen.AssistantAgent(
|
||||
name="quantifier",
|
||||
llm_config={"config_list": config_list},
|
||||
system_message = """You are a helpful assistant. You quantify the output of different tasks based on the given criteria.
|
||||
The criterion is given in a dictionary format where each key is a distinct criteria.
|
||||
The value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key}
|
||||
You are going to quantify each of the criteria for a given task based on the task description.
|
||||
Return a dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria.
|
||||
Return only the dictionary."""
|
||||
|
||||
)
|
||||
```
|
||||
|
||||
## `AgentEval` Results based on Math Problems Dataset
|
||||
|
||||
As an example, after running CriticAgent, we obtained the following criteria to verify the results for the math problems dataset:
|
||||
|
||||
| Criteria | Description | Accepted Values|
|
||||
|-----------|-----|----------------|
|
||||
| Problem Interpretation | Ability to correctly interpret the problem | ["completely off", "slightly relevant", "relevant", "mostly accurate", "completely accurate"]|
|
||||
| Mathematical Methodology | Adequacy of the chosen mathematical or algorithmic methodology for the question | ["inappropriate", "barely adequate", "adequate", "mostly effective", "completely effective"] |
|
||||
| Calculation Correctness | Accuracy of calculations made and solutions given | ["completely incorrect", "mostly incorrect", "neither", "mostly correct", "completely correct"] |
|
||||
| Explanation Clarity | Clarity and comprehensibility of explanations, including language use and structure | ["not at all clear", "slightly clear", "moderately clear", "very clear", "completely clear"] |
|
||||
| Code Efficiency | Quality of code in terms of efficiency and elegance |["not at all efficient", "slightly efficient", "moderately efficient", "very efficient", "extremely efficient"] |
|
||||
| Code Correctness | Correctness of the provided code | ["completely incorrect", "mostly incorrect", "partly correct", "mostly correct", "completely correct"] |
|
||||
|
||||
|
||||
Then, after running QuantifierAgent, we obtained the results presented in Fig. 3, where you can see three models:
|
||||
* AgentChat
|
||||
* ReAct
|
||||
* GPT-4 Vanilla Solver
|
||||
|
||||
Lighter colors represent estimates for failed cases, and brighter colors show how discovered criteria were quantified.
|
||||
|
||||

|
||||
<p align="center"><em>Fig.3 presents results based on the overall math problems dataset; `_s` stands for successful cases, `_f` stands for failed cases</em></p>
|
||||
|
||||
We note that while applying AgentEval to math problems, the agent was not exposed to any ground truth information about the problem. As such, this figure illustrates an estimated performance of the three different agents, namely, AutoGen (blue), GPT-4 (red), and ReAct (green). By comparing the performance of any of the three agents in successful cases (dark bars of any color) versus unsuccessful cases (lighter version of the same bar), we observe that AgentEval was able to assign higher quantification to successful cases than to failed ones. This observation verifies AgentEval's ability for task utility prediction. Additionally, AgentEval allows us to go beyond just a binary definition of success, enabling a more in-depth comparison between successful and failed cases.
|
||||
|
||||
It's important not only to identify what is not working but also to recognize what actually went well and why.
|
||||
|
||||
## Limitations and Future Work
|
||||
The current implementation of `AgentEval` has a number of limitations which we are planning to overcome in the future:
|
||||
* The list of criteria varies per run (unless you store a seed). We recommend running `CriticAgent` at least two times and picking the criteria you think are important for your domain.
|
||||
* The results of the `QuantifierAgent` can vary with each run, so we recommend conducting multiple runs to observe the extent of result variations.
|
||||
|
||||
To mitigate the limitations mentioned above, we are working on VerifierAgent, whose goal is to stabilize the results and provide additional explanations.
|
||||
|
||||
## Summary
|
||||
`CriticAgent` and `QuantifierAgent` can be applied to the logs of any type of application, providing you with an in-depth understanding of the utility your solution brings to the user for a given task.
|
||||
|
||||
We would love to hear about how AgentEval works for your application. Any feedback would be useful for future development. Please contact us on our [Discord](http://aka.ms/autogen-dc).
|
||||
|
||||
|
||||
## Previous Research
|
||||
|
||||
```
|
||||
@InProceedings{pmlr-v176-kiseleva22a,
|
||||
title = "Interactive Grounded Language Understanding in a Collaborative Environment: IGLU 2021",
|
||||
author = "Kiseleva, Julia and Li, Ziming and Aliannejadi, Mohammad and Mohanty, Shrestha and ter Hoeve, Maartje and Burtsev, Mikhail and Skrynnik, Alexey and Zholus, Artem and Panov, Aleksandr and Srinet, Kavya and Szlam, Arthur and Sun, Yuxuan and Hofmann, Katja and C{\^o}t{\'e}, Marc-Alexandre and Awadallah, Ahmed and Abdrazakov, Linar and Churin, Igor and Manggala, Putra and Naszadi, Kata and van der Meer, Michiel and Kim, Taewoon",
|
||||
booktitle = "Proceedings of the NeurIPS 2021 Competitions and Demonstrations Track",
|
||||
pages = "146--161",
|
||||
year = 2022,
|
||||
editor = "Kiela, Douwe and Ciccone, Marco and Caputo, Barbara",
|
||||
volume = 176,
|
||||
series = "Proceedings of Machine Learning Research",
|
||||
month = "06--14 Dec",
|
||||
publisher = "PMLR",
|
||||
pdf = {https://proceedings.mlr.press/v176/kiseleva22a/kiseleva22a.pdf},
|
||||
url = {https://proceedings.mlr.press/v176/kiseleva22a.html}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
@InProceedings{pmlr-v220-kiseleva22a,
|
||||
title = "Interactive Grounded Language Understanding in a Collaborative Environment: Retrospective on Iglu 2022 Competition",
|
||||
author = "Kiseleva, Julia and Skrynnik, Alexey and Zholus, Artem and Mohanty, Shrestha and Arabzadeh, Negar and C{\^o}t{\'e}, Marc-Alexandre and Aliannejadi, Mohammad and Teruel, Milagro and Li, Ziming and Burtsev, Mikhail and ter Hoeve, Maartje and Volovikova, Zoya and Panov, Aleksandr and Sun, Yuxuan and Srinet, Kavya and Szlam, Arthur and Awadallah, Ahmed and Rho, Seungeun and Kwon, Taehwan and Wontae Nam, Daniel and Bivort Haiek, Felipe and Zhang, Edwin and Abdrazakov, Linar and Qingyam, Guo and Zhang, Jason and Guo, Zhibin",
|
||||
booktitle = "Proceedings of the NeurIPS 2022 Competitions Track",
|
||||
pages = "204--216",
|
||||
year = 2022,
|
||||
editor = "Ciccone, Marco and Stolovitzky, Gustavo and Albrecht, Jacob",
|
||||
volume = 220,
|
||||
series = "Proceedings of Machine Learning Research",
|
||||
month = "28 Nov--09 Dec",
|
||||
publisher = "PMLR",
|
||||
pdf = "https://proceedings.mlr.press/v220/kiseleva22a/kiseleva22a.pdf",
|
||||
url = "https://proceedings.mlr.press/v220/kiseleva22a.html"
|
||||
}
|
||||
```
|
@ -75,3 +75,15 @@ jieyuz2:
|
||||
title: PhD student at University of Washington
|
||||
url: https://jieyuz2.github.io/
|
||||
image_url: https://github.com/jieyuz2.png
|
||||
|
||||
julianakiseleva:
|
||||
name: Julia Kiseleva
|
||||
title: Senior Researcher at Microsoft Research
|
||||
url: https://github.com/julianakiseleva/
|
||||
image_url: https://avatars.githubusercontent.com/u/5908392?v=4
|
||||
|
||||
narabzad:
|
||||
name: Negar Arabzadeh
|
||||
title: PhD student at the University of Waterloo
|
||||
url: https://www.negara.me/
|
||||
image_url: https://github.com/Narabzad.png
|
||||
|
Loading…
x
Reference in New Issue
Block a user