feat: add context for aiAction (#528)

This commit is contained in:
yuyutaotao 2025-04-02 20:34:23 +08:00 committed by GitHub
parent 0c0675ba4b
commit bcdf90b997
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 323 additions and 143 deletions

View File

@ -15,6 +15,7 @@ These Agents share some common constructor parameters:
* `generateReport: boolean`: If true, a report file will be generated. (Default: true)
* `autoPrintReportMsg: boolean`: If true, report messages will be printed. (Default: true)
* `cacheId: string | undefined`: If provided, this cacheId will be used to save or match the cache. (Default: undefined, means cache feature is disabled)
* `aiActionContext: string`: Some background knowledge that should be sent to the AI model when calling `agent.aiAction()`, like 'close the cookie consent dialog first if it exists'. (Default: undefined)
In Puppeteer, there is an additional parameter:
@ -366,6 +367,26 @@ console.log(result);
For more information about YAML scripts, please refer to [Automate with Scripts in YAML](./automate-with-scripts-in-yaml).
:::
### `agent.setAIActionContext()`
This method allows you to set the background knowledge that should be sent to the AI model when calling `agent.aiAction()`.
* Type
```typescript
function setAIActionContext(actionContext: string): Promise<void>;
```
* Parameters:
* `actionContext: string` - The background knowledge that should be sent to the AI model.
* Example:
```typescript
await agent.setAIActionContext('Close the cookie consent dialog first if it exists');
```
## Properties
### `.reportFile`

View File

@ -158,6 +158,9 @@ target:
# boolean, whether to close the newly opened tabs after the bridge is disconnected, optional, default is false
closeNewTabsAfterDisconnect: <boolean>
# string, the background knowledge to send to the AI model when calling aiAction, optional
aiActionContext: <string>
```
### `tasks` part

View File

@ -14,6 +14,7 @@ Midscene 中每个 Agent 都有自己的构造函数。
* `generateReport: boolean`: 如果为 true,则生成报告文件。默认值为 true。
* `autoPrintReportMsg: boolean`: 如果为 true,则打印报告消息。默认值为 true。
* `cacheId: string | undefined`: 如果配置,则使用此 cacheId 保存或匹配缓存。默认值为 undefined,也就是不启用缓存。
* `aiActionContext: string`: 调用 `agent.aiAction()` 时,发送给 AI 模型的背景知识,比如 '有 cookie 对话框时先关闭它',默认值为空。
在 puppeteer 中,还有一个额外的参数:
@ -364,6 +365,25 @@ console.log(result);
更多关于 YAML 脚本的信息,请参考 [Automate with Scripts in YAML](./automate-with-scripts-in-yaml)。
:::
### `agent.setAIActionContext()`
这个方法允许你设置在调用 `agent.aiAction()` 时,发送给 AI 模型的背景知识。
* 类型
```typescript
function setAIActionContext(actionContext: string): Promise<void>;
```
* 参数:
* `actionContext: string` - 要发送给 AI 模型的背景知识。
* 示例:
```typescript
await agent.setAIActionContext('如果 “使用cookie” 对话框存在,先关闭它');
```
## 属性
### `.reportFile`

View File

@ -158,6 +158,9 @@ target:
# 是否在桥接断开时关闭新创建的标签页,可选,默认 false
closeNewTabsAfterDisconnect: <boolean>
# 在调用 aiAction 时发送给 AI 模型的背景知识,可选
aiActionContext: <string>
```
### `tasks` 部分

View File

@ -20,6 +20,7 @@ export async function plan(
userInstruction: string,
opts: {
log?: string;
actionContext?: string;
context: UIContext;
callAI?: typeof callAiFn<PlanningAIResponse>;
},
@ -32,6 +33,7 @@ export async function plan(
const taskBackgroundContextText = generateTaskBackgroundContext(
userInstruction,
opts.log,
opts.actionContext,
);
const userInstructionPrompt = await automationUserPrompt().format({
pageDescription,
@ -39,7 +41,7 @@ export async function plan(
});
let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
if (vlLocateMode()) {
if (vlLocateMode() === 'qwen-vl') {
imagePayload = await paddingToMatchBlockByBase64(imagePayload);
}

View File

@ -309,12 +309,18 @@ export const planSchema: ResponseFormatJSONSchema = {
export const generateTaskBackgroundContext = (
userInstruction: string,
log?: string,
userActionContext?: string,
) => {
if (log) {
return `
Here is the user's instruction:
<instruction>
${userInstruction}
<high_priority_knowledge>
${userActionContext}
</high_priority_knowledge>
${userInstruction}
</instruction>
These are the logs from previous executions, which indicate what was done in the previous actions.
@ -328,8 +334,13 @@ ${log}
return `
Here is the user's instruction:
<instruction>
${userInstruction}
</instruction>`;
<high_priority_knowledge>
${userActionContext}
</high_priority_knowledge>
${userInstruction}
</instruction>
`;
};
export const automationUserPrompt = () => {

View File

@ -27,6 +27,10 @@ export interface MidsceneYamlTask {
}
export interface MidsceneYamlScriptEnv {
output?: string;
aiActionContext?: string;
// for web only
serve?: string;
url: string;
@ -40,7 +44,6 @@ export interface MidsceneYamlScriptEnv {
continueOnNetworkIdleError?: boolean; // should continue if failed to wait for network idle, true for default
};
cookie?: string;
output?: string;
forceSameTabNavigation?: boolean; // if track the newly opened tab, true for default in yaml script
// bridge mode config

View File

@ -304,8 +304,13 @@ Reason:
exports[`system prompts > planning - background context 1`] = `
"
Here is the user's instruction:
<instruction>
THIS IS USER INSTRUCTION
<high_priority_knowledge>
THIS IS BACKGROUND PROMPT
</high_priority_knowledge>
THIS IS USER INSTRUCTION
</instruction>
These are the logs from previous executions, which indicate what was done in the previous actions.

View File

@ -31,6 +31,7 @@ describe('system prompts', () => {
const context = generateTaskBackgroundContext(
'THIS IS USER INSTRUCTION',
'THIS IS WHAT HAS BEEN DONE',
'THIS IS BACKGROUND PROMPT',
);
expect(context).toMatchSnapshot();
});
@ -42,6 +43,7 @@ describe('system prompts', () => {
const result = await prompt.format({
pageDescription: 'THIS IS PAGE DESCRIPTION',
taskBackgroundContext: 'THIS IS BACKGROUND CONTEXT',
userActionContext: 'THIS IS BACKGROUND PROMPT',
});
expect(result).toMatchSnapshot();
});

View File

@ -1,6 +1,81 @@
{
"testDataPath": "todo",
"testCases": [
{
"prompt": "type 'hello' in the input box, sleep 5s, hit enter",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "type 'hello' in the input box",
"log": "I will use action Input to type 'hello' in the input box first.",
"more_actions_needed_by_instruction": true,
"action": {
"type": "Input",
"locate": {
"prompt": "input box for adding a new todo",
"bbox": [513, 128, 1067, 197]
},
"param": {
"value": "hello"
}
},
"sleep": 5000,
"actions": [
{
"type": "Input",
"locate": {
"prompt": "input box for adding a new todo",
"bbox": [513, 128, 1067, 197]
},
"param": {
"value": "hello"
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 197\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n },\n \"sleep\": 5000\n}",
"usage": {
"prompt_tokens": 1979,
"completion_tokens": 147,
"total_tokens": 2126,
"completion_tokens_details": {
"text_tokens": 147
},
"prompt_tokens_details": {
"text_tokens": 781,
"image_tokens": 1198
}
}
},
"response_bbox": [513, 128, 1067, 194],
"annotation_index_id": 1,
"response_rect": {
"left": 513,
"top": 128,
"width": 554,
"height": 69
}
},
{
"prompt": "if there are five entries in the list, click the delete button of the second entry and wait 3s. Otherwise, do nothing.",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "Check if there are five entries in the list and click the delete button of the second entry if so.",
"log": "There are only three entries in the list, so no action is needed according to the instruction.",
"more_actions_needed_by_instruction": false,
"action": null,
"actions": [],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Check if there are five entries in the list and click the delete button of the second entry if so.\",\n \"log\": \"There are only three entries in the list, so no action is needed according to the instruction.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": null\n}",
"usage": {
"prompt_tokens": 1990,
"completion_tokens": 82,
"total_tokens": 2072,
"completion_tokens_details": {
"text_tokens": 82
},
"prompt_tokens_details": {
"text_tokens": 792,
"image_tokens": 1198
}
}
}
},
{
"prompt": "type 'hello' in the input box, sleep 5s, hit enter",
"response_planning": {
@ -32,89 +107,14 @@
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n },\n \"sleep\": 5000\n}",
"usage": {
"prompt_tokens": 1961,
"prompt_tokens": 1979,
"completion_tokens": 147,
"total_tokens": 2108,
"total_tokens": 2126,
"completion_tokens_details": {
"text_tokens": 147
},
"prompt_tokens_details": {
"text_tokens": 763,
"image_tokens": 1198
}
}
},
"response_bbox": [513, 128, 1067, 194],
"annotation_index_id": 1,
"response_rect": {
"left": 513,
"top": 128,
"width": 554,
"height": 66
}
},
{
"prompt": "if there are five entries in the list, click the delete button of the second entry and wait 3s. Otherwise, do nothing.",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "Check if there are five entries in the list and click the delete button of the second entry if so.",
"log": "There are only three entries in the list, so no action is needed according to the instruction.",
"more_actions_needed_by_instruction": false,
"action": null,
"actions": [],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Check if there are five entries in the list and click the delete button of the second entry if so.\",\n \"log\": \"There are only three entries in the list, so no action is needed according to the instruction.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": null\n}",
"usage": {
"prompt_tokens": 1972,
"completion_tokens": 82,
"total_tokens": 2054,
"completion_tokens_details": {
"text_tokens": 82
},
"prompt_tokens_details": {
"text_tokens": 774,
"image_tokens": 1198
}
}
}
},
{
"prompt": "type 'hello' in the input box, sleep 5s, hit enter",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "type 'hello' in the input box",
"log": "I will use action Input to type 'hello' in the input box first.",
"more_actions_needed_by_instruction": true,
"action": {
"type": "Input",
"locate": {
"prompt": "input box for adding new todo",
"bbox": [513, 128, 1067, 194]
},
"param": {
"value": "hello"
}
},
"sleep": 5000,
"actions": [
{
"type": "Input",
"locate": {
"prompt": "input box for adding new todo",
"bbox": [513, 128, 1067, 194]
},
"param": {
"value": "hello"
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"input box for adding new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n },\n \"sleep\": 5000\n}",
"usage": {
"prompt_tokens": 1961,
"completion_tokens": 146,
"total_tokens": 2107,
"completion_tokens_details": {
"text_tokens": 146
},
"prompt_tokens_details": {
"text_tokens": 763,
"text_tokens": 781,
"image_tokens": 1198
}
}
@ -150,16 +150,16 @@
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"hit enter\",\n \"log\": \"I will use action KeyboardPress to hit the Enter key.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"KeyboardPress\",\n \"param\": {\n \"value\": \"Enter\"\n }\n },\n \"sleep\": null\n}",
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"hit enter\",\n \"log\": \"I will use action KeyboardPress to hit the Enter key.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"KeyboardPress\",\n \"param\": {\n \"value\": \"Enter\"\n }\n },\n \"sleep\": 5000\n}",
"usage": {
"prompt_tokens": 2007,
"completion_tokens": 86,
"total_tokens": 2093,
"prompt_tokens": 2025,
"completion_tokens": 90,
"total_tokens": 2115,
"completion_tokens_details": {
"text_tokens": 86
"text_tokens": 90
},
"prompt_tokens_details": {
"text_tokens": 809,
"text_tokens": 827,
"image_tokens": 1198
}
}
@ -175,7 +175,7 @@
"type": "Input",
"locate": {
"prompt": "input box for adding a new todo",
"bbox": [513, 128, 1067, 194]
"bbox": [513, 127, 1068, 194]
},
"param": {
"value": "hello"
@ -186,23 +186,23 @@
"type": "Input",
"locate": {
"prompt": "input box for adding a new todo",
"bbox": [513, 128, 1067, 194]
"bbox": [513, 127, 1068, 194]
},
"param": {
"value": "hello"
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n }\n}",
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 127,\n 1068,\n 194\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n }\n}",
"usage": {
"prompt_tokens": 1953,
"prompt_tokens": 1971,
"completion_tokens": 137,
"total_tokens": 2090,
"total_tokens": 2108,
"completion_tokens_details": {
"text_tokens": 137
},
"prompt_tokens_details": {
"text_tokens": 755,
"text_tokens": 773,
"image_tokens": 1198
}
}
@ -211,15 +211,15 @@
"annotation_index_id": 5,
"response_rect": {
"left": 513,
"top": 128,
"width": 554,
"height": 66
"top": 127,
"width": 555,
"height": 67
}
},
{
"prompt": "click OK",
"response_planning": {
"error": "Failed to plan actions: No 'OK' button found in the current view."
"error": "Failed to plan actions: There is no visible 'OK' button in the screenshot provided."
}
},
{
@ -233,14 +233,14 @@
"actions": [],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it\",\n \"log\": \"There is no 'OK' button visible in the screenshot. No action needed.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": false,\n \"action\": null\n}",
"usage": {
"prompt_tokens": 1956,
"prompt_tokens": 1974,
"completion_tokens": 73,
"total_tokens": 2029,
"total_tokens": 2047,
"completion_tokens_details": {
"text_tokens": 73
},
"prompt_tokens_details": {
"text_tokens": 758,
"text_tokens": 776,
"image_tokens": 1198
}
}
@ -250,22 +250,22 @@
"prompt": "if there is an 'OK' button, click it. If not, try again in next step",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "check if there is an 'OK' button and click it if found",
"log": "I will check the screenshot for an 'OK' button. If not found, I will wait for the next step.",
"log": "I will check the screenshot for an 'OK' button. If not found, I will log that no action should be taken.",
"error": null,
"more_actions_needed_by_instruction": true,
"action": null,
"sleep": 1000,
"sleep": null,
"actions": [],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it if found\",\n \"log\": \"I will check the screenshot for an 'OK' button. If not found, I will wait for the next step.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"action\": null,\n \"sleep\": 1000\n}",
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it if found\",\n \"log\": \"I will check the screenshot for an 'OK' button. If not found, I will log that no action should be taken.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": false,\n \"action\": null,\n \"sleep\": null\n}",
"usage": {
"prompt_tokens": 1965,
"completion_tokens": 97,
"total_tokens": 2062,
"prompt_tokens": 1983,
"completion_tokens": 95,
"total_tokens": 2078,
"completion_tokens_details": {
"text_tokens": 97
"text_tokens": 95
},
"prompt_tokens_details": {
"text_tokens": 767,
"text_tokens": 785,
"image_tokens": 1198
}
}
@ -276,35 +276,33 @@
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "Move the mouse to the second item in the task list and click the delete button on the right of the second task",
"log": "I will use action Hover to move the mouse to the second item in the task list first.",
"error": null,
"more_actions_needed_by_instruction": true,
"action": {
"type": "Hover",
"locate": {
"prompt": "second item in the task list",
"bbox": [574, 269, 693, 298]
"bbox": [574, 276, 693, 296]
}
},
"sleep": 200,
"actions": [
{
"type": "Hover",
"locate": {
"prompt": "second item in the task list",
"bbox": [574, 269, 693, 298]
"bbox": [574, 276, 693, 296]
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Move the mouse to the second item in the task list and click the delete button on the right of the second task\",\n \"log\": \"I will use action Hover to move the mouse to the second item in the task list first.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Hover\",\n \"locate\": {\n \"bbox_2d\": [\n 574,\n 269,\n 693,\n 298\n ],\n \"prompt\": \"second item in the task list\"\n }\n },\n \"sleep\": 200\n}",
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Move the mouse to the second item in the task list and click the delete button on the right of the second task\",\n \"log\": \"I will use action Hover to move the mouse to the second item in the task list first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Hover\",\n \"locate\": {\n \"bbox_2d\": [\n 574,\n 276,\n 693,\n 296\n ],\n \"prompt\": \"second item in the task list\"\n }\n }\n}",
"usage": {
"prompt_tokens": 1968,
"completion_tokens": 154,
"total_tokens": 2122,
"prompt_tokens": 1986,
"completion_tokens": 139,
"total_tokens": 2125,
"completion_tokens_details": {
"text_tokens": 154
"text_tokens": 139
},
"prompt_tokens_details": {
"text_tokens": 770,
"text_tokens": 788,
"image_tokens": 1198
}
}
@ -313,22 +311,22 @@
"annotation_index_id": 9,
"response_rect": {
"left": 574,
"top": 269,
"top": 276,
"width": 119,
"height": 29
"height": 20
}
},
{
"prompt": "在任务框 input 输入 今天学习 JS按回车键",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "在任务框 input 输入 今天学习 JS按回车键",
"log": "我将使用 Input 动作来填写任务框中的内容。",
"log": "我将使用 Input 动作来填写任务框,并输入 '今天学习 JS'。",
"more_actions_needed_by_instruction": true,
"action": {
"type": "Input",
"locate": {
"prompt": "任务输入框",
"bbox": [513, 127, 1068, 197]
"prompt": "任务框的输入区域",
"bbox": [513, 128, 1067, 194]
},
"param": {
"value": "今天学习 JS"
@ -338,24 +336,24 @@
{
"type": "Input",
"locate": {
"prompt": "任务输入框",
"bbox": [513, 127, 1068, 197]
"prompt": "任务框的输入区域",
"bbox": [513, 128, 1067, 194]
},
"param": {
"value": "今天学习 JS"
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"在任务框 input 输入 今天学习 JS按回车键\",\n \"log\": \"我将使用 Input 动作来填写任务框中的内容。\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 127,\n 1068,\n 197\n ],\n \"prompt\": \"任务输入框\"\n },\n \"param\": {\n \"value\": \"今天学习 JS\"\n }\n }\n}",
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"在任务框 input 输入 今天学习 JS按回车键\",\n \"log\": \"我将使用 Input 动作来填写任务框,并输入 '今天学习 JS'。\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"任务框的输入区域\"\n },\n \"param\": {\n \"value\": \"今天学习 JS\"\n }\n }\n}",
"usage": {
"prompt_tokens": 1959,
"completion_tokens": 139,
"total_tokens": 2098,
"prompt_tokens": 1977,
"completion_tokens": 146,
"total_tokens": 2123,
"completion_tokens_details": {
"text_tokens": 139
"text_tokens": 146
},
"prompt_tokens_details": {
"text_tokens": 761,
"text_tokens": 779,
"image_tokens": 1198
}
}
@ -364,10 +362,103 @@
"annotation_index_id": 10,
"response_rect": {
"left": 513,
"top": 127,
"width": 555,
"height": 70
"top": 128,
"width": 554,
"height": 66
}
},
{
"prompt": "Click the 'clear completed' button",
"action_context": "Always click the input box first before doing anything else",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "Click the 'clear completed' button",
"log": "I will use action Tap to click the input box first before clicking the 'clear completed' button.",
"error": null,
"more_actions_needed_by_instruction": true,
"action": {
"type": "Tap",
"locate": {
"prompt": "input box",
"bbox": [574, 156, 839, 176]
}
},
"sleep": 500,
"actions": [
{
"type": "Tap",
"locate": {
"prompt": "input box",
"bbox": [574, 156, 839, 176]
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Click the 'clear completed' button\",\n \"log\": \"I will use action Tap to click the input box first before clicking the 'clear completed' button.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Tap\",\n \"locate\": {\n \"bbox_2d\": [\n 574,\n 156,\n 839,\n 176\n ],\n \"prompt\": \"input box\"\n }\n },\n \"sleep\": 500\n}",
"usage": {
"prompt_tokens": 1979,
"completion_tokens": 135,
"total_tokens": 2114,
"completion_tokens_details": {
"text_tokens": 135
},
"prompt_tokens_details": {
"text_tokens": 781,
"image_tokens": 1198
}
}
},
"response_rect": {
"left": 574,
"top": 156,
"width": 265,
"height": 20
},
"annotation_index_id": 11
},
{
"prompt": "Click the 'clear completed' button",
"action_context": "Always click the input box first before doing anything else",
"log": "I will use action Tap to click the input box first before doing anything else.",
"response_planning": {
"what_the_user_wants_to_do_next_by_instruction": "Click the 'clear completed' button",
"log": "I will use action Tap to click the 'clear completed' button next.",
"more_actions_needed_by_instruction": false,
"action": {
"type": "Tap",
"locate": {
"prompt": "'Clear completed' button",
"bbox": [946, 387, 1050, 404]
}
},
"actions": [
{
"type": "Tap",
"locate": {
"prompt": "'Clear completed' button",
"bbox": [946, 387, 1050, 404]
}
}
],
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Click the 'clear completed' button\",\n \"log\": \"I will use action Tap to click the 'clear completed' button next.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"locate\": {\n \"bbox_2d\": [\n 946,\n 387,\n 1050,\n 404\n ],\n \"prompt\": \"'Clear completed' button\"\n }\n }\n}",
"usage": {
"prompt_tokens": 2027,
"completion_tokens": 118,
"total_tokens": 2145,
"completion_tokens_details": {
"text_tokens": 118
},
"prompt_tokens_details": {
"text_tokens": 829,
"image_tokens": 1198
}
}
},
"response_rect": {
"left": 946,
"top": 387,
"width": 104,
"height": 17
},
"annotation_index_id": 12
}
]
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 302 KiB

After

Width:  |  Height:  |  Size: 304 KiB

View File

@ -79,9 +79,9 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
const vlCases = [
'todo-vl',
'aweme-login-vl',
'antd-form-vl',
'antd-tooltip-vl',
// 'aweme-login-vl',
// 'antd-form-vl',
// 'antd-tooltip-vl',
];
describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
@ -117,6 +117,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
res = await plan(prompt, {
log: testCase.log,
context,
actionContext: testCase.action_context,
});
} catch (error) {
res = error as Error;

View File

@ -20,6 +20,7 @@ export type TestCase = {
response_planning?: PlanningAIResponse;
expected?: boolean;
annotation_index_id?: number;
action_context?: string;
};
export type InspectAiTestCase = {

View File

@ -52,6 +52,7 @@ export interface PageAgentOpt {
/* if auto print report msg, default true */
autoPrintReportMsg?: boolean;
onTaskStartTip?: OnTaskStartTip;
aiActionContext?: string;
}
export class PageAgent<PageType extends WebPage = WebPage> {
@ -119,6 +120,10 @@ export class PageAgent<PageType extends WebPage = WebPage> {
});
}
/**
 * Stores the background knowledge used for later `aiAction()` calls.
 *
 * The value is written to `this.opts.aiActionContext`, replacing whatever
 * was there before. The method is declared `async`, so it returns a promise
 * even though nothing is awaited inside it.
 *
 * @param prompt - The background knowledge text to keep on the agent options.
 */
async setAIActionContext(prompt: string) {
  const { opts } = this;
  opts.aiActionContext = prompt;
}
resetDump() {
this.dump = {
groupName: this.opts.groupName!,
@ -283,7 +288,7 @@ export class PageAgent<PageType extends WebPage = WebPage> {
async aiAction(taskPrompt: string) {
const { executor } = await (getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS)
? this.taskExecutor.actionToGoal(taskPrompt)
: this.taskExecutor.action(taskPrompt));
: this.taskExecutor.action(taskPrompt, this.opts.aiActionContext));
this.afterTaskRunning(executor);
}

View File

@ -511,6 +511,7 @@ export class PageTaskExecutor {
userInstruction: string,
cacheGroup: ReturnType<TaskCache['getCacheGroupByPrompt']>,
log?: string,
actionContext?: string,
) {
const task: ExecutionTaskPlanningApply = {
type: 'Planning',
@ -555,6 +556,7 @@ export class PageTaskExecutor {
planResult = await plan(param.userInstruction, {
context: pageContext,
log: param.log,
actionContext,
});
}
@ -761,14 +763,22 @@ export class PageTaskExecutor {
};
}
async action(userPrompt: string): Promise<ExecutionResult> {
async action(
userPrompt: string,
actionContext?: string,
): Promise<ExecutionResult> {
const taskExecutor = new Executor(taskTitleStr('Action', userPrompt), {
onTaskStart: this.onTaskStartCallback,
});
const cacheGroup = this.taskCache.getCacheGroupByPrompt(userPrompt);
let planningTask: ExecutionTaskPlanningApply | null =
this.planningTaskFromPrompt(userPrompt, cacheGroup);
this.planningTaskFromPrompt(
userPrompt,
cacheGroup,
undefined,
actionContext,
);
let result: any;
let replanCount = 0;
const logList: string[] = [];
@ -825,6 +835,7 @@ export class PageTaskExecutor {
userPrompt,
cacheGroup,
logList.length > 0 ? `- ${logList.join('\n- ')}` : undefined,
actionContext,
);
replanCount++;
}

View File

@ -177,6 +177,7 @@ export async function puppeteerAgentForTarget(
autoPrintReportMsg: false,
testId: preference?.testId,
cacheId: preference?.cacheId,
aiActionContext: target.aiActionContext,
forceSameTabNavigation:
typeof target.forceSameTabNavigation !== 'undefined'
? target.forceSameTabNavigation