mirror of
https://github.com/web-infra-dev/midscene.git
synced 2025-12-27 15:10:20 +00:00
feat: add context for aiAction (#528)
This commit is contained in:
parent
0c0675ba4b
commit
bcdf90b997
@ -15,6 +15,7 @@ These Agents share some common constructor parameters:
|
||||
* `generateReport: boolean`: If true, a report file will be generated. (Default: true)
|
||||
* `autoPrintReportMsg: boolean`: If true, report messages will be printed. (Default: true)
|
||||
* `cacheId: string | undefined`: If provided, this cacheId will be used to save or match the cache. (Default: undefined, which means the cache feature is disabled)
|
||||
* `aiActionContext: string`: Some background knowledge that should be sent to the AI model when calling `agent.aiAction()`, like 'close the cookie consent dialog first if it exists'. (Default: undefined)
|
||||
|
||||
In Puppeteer, there is an additional parameter:
|
||||
|
||||
@ -366,6 +367,26 @@ console.log(result);
|
||||
For more information about YAML scripts, please refer to [Automate with Scripts in YAML](./automate-with-scripts-in-yaml).
|
||||
:::
|
||||
|
||||
### `agent.setAIActionContext()`
|
||||
|
||||
This method allows you to set the background knowledge that should be sent to the AI model when calling `agent.aiAction()`.
|
||||
|
||||
* Type
|
||||
|
||||
```typescript
|
||||
function setAIActionContext(actionContext: string): Promise<void>;
|
||||
```
|
||||
|
||||
* Parameters:
|
||||
* `actionContext: string` - The background knowledge that should be sent to the AI model.
|
||||
|
||||
* Example:
|
||||
|
||||
```typescript
|
||||
await agent.setAIActionContext('Close the cookie consent dialog first if it exists');
|
||||
```
|
||||
|
||||
|
||||
## Properties
|
||||
|
||||
### `.reportFile`
|
||||
|
||||
@ -158,6 +158,9 @@ target:
|
||||
|
||||
# boolean, whether to close the newly opened tabs after the bridge is disconnected, optional, default is false
|
||||
closeNewTabsAfterDisconnect: <boolean>
|
||||
|
||||
# string, the background knowledge to send to the AI model when calling aiAction, optional
|
||||
aiActionContext: <string>
|
||||
```
|
||||
|
||||
### `tasks` part
|
||||
|
||||
@ -14,6 +14,7 @@ Midscene 中每个 Agent 都有自己的构造函数。
|
||||
* `generateReport: boolean`: 如果为 true,则生成报告文件。默认值为 true。
|
||||
* `autoPrintReportMsg: boolean`: 如果为 true,则打印报告消息。默认值为 true。
|
||||
* `cacheId: string | undefined`: 如果配置,则使用此 cacheId 保存或匹配缓存。默认值为 undefined,也就是不启用缓存。
|
||||
* `aiActionContext: string`: 调用 `agent.aiAction()` 时,发送给 AI 模型的背景知识,比如 '有 cookie 对话框时先关闭它',默认值为空。
|
||||
|
||||
在 puppeteer 中,还有一个额外的参数:
|
||||
|
||||
@ -364,6 +365,25 @@ console.log(result);
|
||||
更多关于 YAML 脚本的信息,请参考 [Automate with Scripts in YAML](./automate-with-scripts-in-yaml)。
|
||||
:::
|
||||
|
||||
### `agent.setAIActionContext()`
|
||||
|
||||
这个方法允许你设置在调用 `agent.aiAction()` 时,发送给 AI 模型的背景知识。
|
||||
|
||||
* 类型
|
||||
|
||||
```typescript
|
||||
function setAIActionContext(actionContext: string): Promise<void>;
|
||||
```
|
||||
|
||||
* 参数:
|
||||
* `actionContext: string` - 要发送给 AI 模型的背景知识。
|
||||
|
||||
* 示例:
|
||||
|
||||
```typescript
|
||||
await agent.setAIActionContext('如果 “使用cookie” 对话框存在,先关闭它');
|
||||
```
|
||||
|
||||
## 属性
|
||||
|
||||
### `.reportFile`
|
||||
|
||||
@ -158,6 +158,9 @@ target:
|
||||
|
||||
# 是否在桥接断开时关闭新创建的标签页,可选,默认 false
|
||||
closeNewTabsAfterDisconnect: <boolean>
|
||||
|
||||
# 在调用 aiAction 时发送给 AI 模型的背景知识,可选
|
||||
aiActionContext: <string>
|
||||
```
|
||||
|
||||
### `tasks` 部分
|
||||
|
||||
@ -20,6 +20,7 @@ export async function plan(
|
||||
userInstruction: string,
|
||||
opts: {
|
||||
log?: string;
|
||||
actionContext?: string;
|
||||
context: UIContext;
|
||||
callAI?: typeof callAiFn<PlanningAIResponse>;
|
||||
},
|
||||
@ -32,6 +33,7 @@ export async function plan(
|
||||
const taskBackgroundContextText = generateTaskBackgroundContext(
|
||||
userInstruction,
|
||||
opts.log,
|
||||
opts.actionContext,
|
||||
);
|
||||
const userInstructionPrompt = await automationUserPrompt().format({
|
||||
pageDescription,
|
||||
@ -39,7 +41,7 @@ export async function plan(
|
||||
});
|
||||
|
||||
let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
|
||||
if (vlLocateMode()) {
|
||||
if (vlLocateMode() === 'qwen-vl') {
|
||||
imagePayload = await paddingToMatchBlockByBase64(imagePayload);
|
||||
}
|
||||
|
||||
|
||||
@ -309,12 +309,18 @@ export const planSchema: ResponseFormatJSONSchema = {
|
||||
export const generateTaskBackgroundContext = (
|
||||
userInstruction: string,
|
||||
log?: string,
|
||||
userActionContext?: string,
|
||||
) => {
|
||||
if (log) {
|
||||
return `
|
||||
Here is the user's instruction:
|
||||
|
||||
<instruction>
|
||||
${userInstruction}
|
||||
<high_priority_knowledge>
|
||||
${userActionContext}
|
||||
</high_priority_knowledge>
|
||||
|
||||
${userInstruction}
|
||||
</instruction>
|
||||
|
||||
These are the logs from previous executions, which indicate what was done in the previous actions.
|
||||
@ -328,8 +334,13 @@ ${log}
|
||||
return `
|
||||
Here is the user's instruction:
|
||||
<instruction>
|
||||
${userInstruction}
|
||||
</instruction>`;
|
||||
<high_priority_knowledge>
|
||||
${userActionContext}
|
||||
</high_priority_knowledge>
|
||||
|
||||
${userInstruction}
|
||||
</instruction>
|
||||
`;
|
||||
};
|
||||
|
||||
export const automationUserPrompt = () => {
|
||||
|
||||
@ -27,6 +27,10 @@ export interface MidsceneYamlTask {
|
||||
}
|
||||
|
||||
export interface MidsceneYamlScriptEnv {
|
||||
output?: string;
|
||||
aiActionContext?: string;
|
||||
|
||||
// for web only
|
||||
serve?: string;
|
||||
url: string;
|
||||
|
||||
@ -40,7 +44,6 @@ export interface MidsceneYamlScriptEnv {
|
||||
continueOnNetworkIdleError?: boolean; // should continue if failed to wait for network idle, true for default
|
||||
};
|
||||
cookie?: string;
|
||||
output?: string;
|
||||
forceSameTabNavigation?: boolean; // if track the newly opened tab, true for default in yaml script
|
||||
|
||||
// bridge mode config
|
||||
|
||||
@ -304,8 +304,13 @@ Reason:
|
||||
exports[`system prompts > planning - background context 1`] = `
|
||||
"
|
||||
Here is the user's instruction:
|
||||
|
||||
<instruction>
|
||||
THIS IS USER INSTRUCTION
|
||||
<high_priority_knowledge>
|
||||
THIS IS BACKGROUND PROMPT
|
||||
</high_priority_knowledge>
|
||||
|
||||
THIS IS USER INSTRUCTION
|
||||
</instruction>
|
||||
|
||||
These are the logs from previous executions, which indicate what was done in the previous actions.
|
||||
|
||||
@ -31,6 +31,7 @@ describe('system prompts', () => {
|
||||
const context = generateTaskBackgroundContext(
|
||||
'THIS IS USER INSTRUCTION',
|
||||
'THIS IS WHAT HAS BEEN DONE',
|
||||
'THIS IS BACKGROUND PROMPT',
|
||||
);
|
||||
expect(context).toMatchSnapshot();
|
||||
});
|
||||
@ -42,6 +43,7 @@ describe('system prompts', () => {
|
||||
const result = await prompt.format({
|
||||
pageDescription: 'THIS IS PAGE DESCRIPTION',
|
||||
taskBackgroundContext: 'THIS IS BACKGROUND CONTEXT',
|
||||
userActionContext: 'THIS IS BACKGROUND PROMPT',
|
||||
});
|
||||
expect(result).toMatchSnapshot();
|
||||
});
|
||||
|
||||
@ -1,6 +1,81 @@
|
||||
{
|
||||
"testDataPath": "todo",
|
||||
"testCases": [
|
||||
{
|
||||
"prompt": "type 'hello' in the input box, sleep 5s, hit enter",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "type 'hello' in the input box",
|
||||
"log": "I will use action Input to type 'hello' in the input box first.",
|
||||
"more_actions_needed_by_instruction": true,
|
||||
"action": {
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "input box for adding a new todo",
|
||||
"bbox": [513, 128, 1067, 197]
|
||||
},
|
||||
"param": {
|
||||
"value": "hello"
|
||||
}
|
||||
},
|
||||
"sleep": 5000,
|
||||
"actions": [
|
||||
{
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "input box for adding a new todo",
|
||||
"bbox": [513, 128, 1067, 197]
|
||||
},
|
||||
"param": {
|
||||
"value": "hello"
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 197\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n },\n \"sleep\": 5000\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1979,
|
||||
"completion_tokens": 147,
|
||||
"total_tokens": 2126,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 147
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 781,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
},
|
||||
"response_bbox": [513, 128, 1067, 194],
|
||||
"annotation_index_id": 1,
|
||||
"response_rect": {
|
||||
"left": 513,
|
||||
"top": 128,
|
||||
"width": 554,
|
||||
"height": 69
|
||||
}
|
||||
},
|
||||
{
|
||||
"prompt": "if there are five entries in the list, click the delete button of the second entry and wait 3s. Otherwise, do nothing.",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "Check if there are five entries in the list and click the delete button of the second entry if so.",
|
||||
"log": "There are only three entries in the list, so no action is needed according to the instruction.",
|
||||
"more_actions_needed_by_instruction": false,
|
||||
"action": null,
|
||||
"actions": [],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Check if there are five entries in the list and click the delete button of the second entry if so.\",\n \"log\": \"There are only three entries in the list, so no action is needed according to the instruction.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": null\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1990,
|
||||
"completion_tokens": 82,
|
||||
"total_tokens": 2072,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 82
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 792,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"prompt": "type 'hello' in the input box, sleep 5s, hit enter",
|
||||
"response_planning": {
|
||||
@ -32,89 +107,14 @@
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n },\n \"sleep\": 5000\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1961,
|
||||
"prompt_tokens": 1979,
|
||||
"completion_tokens": 147,
|
||||
"total_tokens": 2108,
|
||||
"total_tokens": 2126,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 147
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 763,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
},
|
||||
"response_bbox": [513, 128, 1067, 194],
|
||||
"annotation_index_id": 1,
|
||||
"response_rect": {
|
||||
"left": 513,
|
||||
"top": 128,
|
||||
"width": 554,
|
||||
"height": 66
|
||||
}
|
||||
},
|
||||
{
|
||||
"prompt": "if there are five entries in the list, click the delete button of the second entry and wait 3s. Otherwise, do nothing.",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "Check if there are five entries in the list and click the delete button of the second entry if so.",
|
||||
"log": "There are only three entries in the list, so no action is needed according to the instruction.",
|
||||
"more_actions_needed_by_instruction": false,
|
||||
"action": null,
|
||||
"actions": [],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Check if there are five entries in the list and click the delete button of the second entry if so.\",\n \"log\": \"There are only three entries in the list, so no action is needed according to the instruction.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": null\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1972,
|
||||
"completion_tokens": 82,
|
||||
"total_tokens": 2054,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 82
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 774,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"prompt": "type 'hello' in the input box, sleep 5s, hit enter",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "type 'hello' in the input box",
|
||||
"log": "I will use action Input to type 'hello' in the input box first.",
|
||||
"more_actions_needed_by_instruction": true,
|
||||
"action": {
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "input box for adding new todo",
|
||||
"bbox": [513, 128, 1067, 194]
|
||||
},
|
||||
"param": {
|
||||
"value": "hello"
|
||||
}
|
||||
},
|
||||
"sleep": 5000,
|
||||
"actions": [
|
||||
{
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "input box for adding new todo",
|
||||
"bbox": [513, 128, 1067, 194]
|
||||
},
|
||||
"param": {
|
||||
"value": "hello"
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"input box for adding new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n },\n \"sleep\": 5000\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1961,
|
||||
"completion_tokens": 146,
|
||||
"total_tokens": 2107,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 146
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 763,
|
||||
"text_tokens": 781,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
@ -150,16 +150,16 @@
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"hit enter\",\n \"log\": \"I will use action KeyboardPress to hit the Enter key.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"KeyboardPress\",\n \"param\": {\n \"value\": \"Enter\"\n }\n },\n \"sleep\": null\n}",
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"hit enter\",\n \"log\": \"I will use action KeyboardPress to hit the Enter key.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"KeyboardPress\",\n \"param\": {\n \"value\": \"Enter\"\n }\n },\n \"sleep\": 5000\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 2007,
|
||||
"completion_tokens": 86,
|
||||
"total_tokens": 2093,
|
||||
"prompt_tokens": 2025,
|
||||
"completion_tokens": 90,
|
||||
"total_tokens": 2115,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 86
|
||||
"text_tokens": 90
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 809,
|
||||
"text_tokens": 827,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
@ -175,7 +175,7 @@
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "input box for adding a new todo",
|
||||
"bbox": [513, 128, 1067, 194]
|
||||
"bbox": [513, 127, 1068, 194]
|
||||
},
|
||||
"param": {
|
||||
"value": "hello"
|
||||
@ -186,23 +186,23 @@
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "input box for adding a new todo",
|
||||
"bbox": [513, 128, 1067, 194]
|
||||
"bbox": [513, 127, 1068, 194]
|
||||
},
|
||||
"param": {
|
||||
"value": "hello"
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n }\n}",
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 127,\n 1068,\n 194\n ],\n \"prompt\": \"input box for adding a new todo\"\n },\n \"param\": {\n \"value\": \"hello\"\n }\n }\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1953,
|
||||
"prompt_tokens": 1971,
|
||||
"completion_tokens": 137,
|
||||
"total_tokens": 2090,
|
||||
"total_tokens": 2108,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 137
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 755,
|
||||
"text_tokens": 773,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
@ -211,15 +211,15 @@
|
||||
"annotation_index_id": 5,
|
||||
"response_rect": {
|
||||
"left": 513,
|
||||
"top": 128,
|
||||
"width": 554,
|
||||
"height": 66
|
||||
"top": 127,
|
||||
"width": 555,
|
||||
"height": 67
|
||||
}
|
||||
},
|
||||
{
|
||||
"prompt": "click OK",
|
||||
"response_planning": {
|
||||
"error": "Failed to plan actions: No 'OK' button found in the current view."
|
||||
"error": "Failed to plan actions: There is no visible 'OK' button in the screenshot provided."
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -233,14 +233,14 @@
|
||||
"actions": [],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it\",\n \"log\": \"There is no 'OK' button visible in the screenshot. No action needed.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": false,\n \"action\": null\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1956,
|
||||
"prompt_tokens": 1974,
|
||||
"completion_tokens": 73,
|
||||
"total_tokens": 2029,
|
||||
"total_tokens": 2047,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 73
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 758,
|
||||
"text_tokens": 776,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
@ -250,22 +250,22 @@
|
||||
"prompt": "if there is an 'OK' button, click it. If not, try again in next step",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "check if there is an 'OK' button and click it if found",
|
||||
"log": "I will check the screenshot for an 'OK' button. If not found, I will wait for the next step.",
|
||||
"log": "I will check the screenshot for an 'OK' button. If not found, I will log that no action should be taken.",
|
||||
"error": null,
|
||||
"more_actions_needed_by_instruction": true,
|
||||
"action": null,
|
||||
"sleep": 1000,
|
||||
"sleep": null,
|
||||
"actions": [],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it if found\",\n \"log\": \"I will check the screenshot for an 'OK' button. If not found, I will wait for the next step.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"action\": null,\n \"sleep\": 1000\n}",
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it if found\",\n \"log\": \"I will check the screenshot for an 'OK' button. If not found, I will log that no action should be taken.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": false,\n \"action\": null,\n \"sleep\": null\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1965,
|
||||
"completion_tokens": 97,
|
||||
"total_tokens": 2062,
|
||||
"prompt_tokens": 1983,
|
||||
"completion_tokens": 95,
|
||||
"total_tokens": 2078,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 97
|
||||
"text_tokens": 95
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 767,
|
||||
"text_tokens": 785,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
@ -276,35 +276,33 @@
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "Move the mouse to the second item in the task list and click the delete button on the right of the second task",
|
||||
"log": "I will use action Hover to move the mouse to the second item in the task list first.",
|
||||
"error": null,
|
||||
"more_actions_needed_by_instruction": true,
|
||||
"action": {
|
||||
"type": "Hover",
|
||||
"locate": {
|
||||
"prompt": "second item in the task list",
|
||||
"bbox": [574, 269, 693, 298]
|
||||
"bbox": [574, 276, 693, 296]
|
||||
}
|
||||
},
|
||||
"sleep": 200,
|
||||
"actions": [
|
||||
{
|
||||
"type": "Hover",
|
||||
"locate": {
|
||||
"prompt": "second item in the task list",
|
||||
"bbox": [574, 269, 693, 298]
|
||||
"bbox": [574, 276, 693, 296]
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Move the mouse to the second item in the task list and click the delete button on the right of the second task\",\n \"log\": \"I will use action Hover to move the mouse to the second item in the task list first.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Hover\",\n \"locate\": {\n \"bbox_2d\": [\n 574,\n 269,\n 693,\n 298\n ],\n \"prompt\": \"second item in the task list\"\n }\n },\n \"sleep\": 200\n}",
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Move the mouse to the second item in the task list and click the delete button on the right of the second task\",\n \"log\": \"I will use action Hover to move the mouse to the second item in the task list first.\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Hover\",\n \"locate\": {\n \"bbox_2d\": [\n 574,\n 276,\n 693,\n 296\n ],\n \"prompt\": \"second item in the task list\"\n }\n }\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1968,
|
||||
"completion_tokens": 154,
|
||||
"total_tokens": 2122,
|
||||
"prompt_tokens": 1986,
|
||||
"completion_tokens": 139,
|
||||
"total_tokens": 2125,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 154
|
||||
"text_tokens": 139
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 770,
|
||||
"text_tokens": 788,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
@ -313,22 +311,22 @@
|
||||
"annotation_index_id": 9,
|
||||
"response_rect": {
|
||||
"left": 574,
|
||||
"top": 269,
|
||||
"top": 276,
|
||||
"width": 119,
|
||||
"height": 29
|
||||
"height": 20
|
||||
}
|
||||
},
|
||||
{
|
||||
"prompt": "在任务框 input 输入 今天学习 JS,按回车键",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "在任务框 input 输入 今天学习 JS,按回车键",
|
||||
"log": "我将使用 Input 动作来填写任务框中的内容。",
|
||||
"log": "我将使用 Input 动作来填写任务框,并输入 '今天学习 JS'。",
|
||||
"more_actions_needed_by_instruction": true,
|
||||
"action": {
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "任务输入框",
|
||||
"bbox": [513, 127, 1068, 197]
|
||||
"prompt": "任务框的输入区域",
|
||||
"bbox": [513, 128, 1067, 194]
|
||||
},
|
||||
"param": {
|
||||
"value": "今天学习 JS"
|
||||
@ -338,24 +336,24 @@
|
||||
{
|
||||
"type": "Input",
|
||||
"locate": {
|
||||
"prompt": "任务输入框",
|
||||
"bbox": [513, 127, 1068, 197]
|
||||
"prompt": "任务框的输入区域",
|
||||
"bbox": [513, 128, 1067, 194]
|
||||
},
|
||||
"param": {
|
||||
"value": "今天学习 JS"
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"在任务框 input 输入 今天学习 JS,按回车键\",\n \"log\": \"我将使用 Input 动作来填写任务框中的内容。\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 127,\n 1068,\n 197\n ],\n \"prompt\": \"任务输入框\"\n },\n \"param\": {\n \"value\": \"今天学习 JS\"\n }\n }\n}",
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"在任务框 input 输入 今天学习 JS,按回车键\",\n \"log\": \"我将使用 Input 动作来填写任务框,并输入 '今天学习 JS'。\",\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 513,\n 128,\n 1067,\n 194\n ],\n \"prompt\": \"任务框的输入区域\"\n },\n \"param\": {\n \"value\": \"今天学习 JS\"\n }\n }\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1959,
|
||||
"completion_tokens": 139,
|
||||
"total_tokens": 2098,
|
||||
"prompt_tokens": 1977,
|
||||
"completion_tokens": 146,
|
||||
"total_tokens": 2123,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 139
|
||||
"text_tokens": 146
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 761,
|
||||
"text_tokens": 779,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
@ -364,10 +362,103 @@
|
||||
"annotation_index_id": 10,
|
||||
"response_rect": {
|
||||
"left": 513,
|
||||
"top": 127,
|
||||
"width": 555,
|
||||
"height": 70
|
||||
"top": 128,
|
||||
"width": 554,
|
||||
"height": 66
|
||||
}
|
||||
},
|
||||
{
|
||||
"prompt": "Click the 'clear completed' button",
|
||||
"action_context": "Always click the input box first before doing anything else",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "Click the 'clear completed' button",
|
||||
"log": "I will use action Tap to click the input box first before clicking the 'clear completed' button.",
|
||||
"error": null,
|
||||
"more_actions_needed_by_instruction": true,
|
||||
"action": {
|
||||
"type": "Tap",
|
||||
"locate": {
|
||||
"prompt": "input box",
|
||||
"bbox": [574, 156, 839, 176]
|
||||
}
|
||||
},
|
||||
"sleep": 500,
|
||||
"actions": [
|
||||
{
|
||||
"type": "Tap",
|
||||
"locate": {
|
||||
"prompt": "input box",
|
||||
"bbox": [574, 156, 839, 176]
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Click the 'clear completed' button\",\n \"log\": \"I will use action Tap to click the input box first before clicking the 'clear completed' button.\",\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"action\": {\n \"type\": \"Tap\",\n \"locate\": {\n \"bbox_2d\": [\n 574,\n 156,\n 839,\n 176\n ],\n \"prompt\": \"input box\"\n }\n },\n \"sleep\": 500\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 1979,
|
||||
"completion_tokens": 135,
|
||||
"total_tokens": 2114,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 135
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 781,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
},
|
||||
"response_rect": {
|
||||
"left": 574,
|
||||
"top": 156,
|
||||
"width": 265,
|
||||
"height": 20
|
||||
},
|
||||
"annotation_index_id": 11
|
||||
},
|
||||
{
|
||||
"prompt": "Click the 'clear completed' button",
|
||||
"action_context": "Always click the input box first before doing anything else",
|
||||
"log": "I will use action Tap to click the input box first before doing anything else.",
|
||||
"response_planning": {
|
||||
"what_the_user_wants_to_do_next_by_instruction": "Click the 'clear completed' button",
|
||||
"log": "I will use action Tap to click the 'clear completed' button next.",
|
||||
"more_actions_needed_by_instruction": false,
|
||||
"action": {
|
||||
"type": "Tap",
|
||||
"locate": {
|
||||
"prompt": "'Clear completed' button",
|
||||
"bbox": [946, 387, 1050, 404]
|
||||
}
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "Tap",
|
||||
"locate": {
|
||||
"prompt": "'Clear completed' button",
|
||||
"bbox": [946, 387, 1050, 404]
|
||||
}
|
||||
}
|
||||
],
|
||||
"rawResponse": "{\n \"what_the_user_wants_to_do_next_by_instruction\": \"Click the 'clear completed' button\",\n \"log\": \"I will use action Tap to click the 'clear completed' button next.\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"locate\": {\n \"bbox_2d\": [\n 946,\n 387,\n 1050,\n 404\n ],\n \"prompt\": \"'Clear completed' button\"\n }\n }\n}",
|
||||
"usage": {
|
||||
"prompt_tokens": 2027,
|
||||
"completion_tokens": 118,
|
||||
"total_tokens": 2145,
|
||||
"completion_tokens_details": {
|
||||
"text_tokens": 118
|
||||
},
|
||||
"prompt_tokens_details": {
|
||||
"text_tokens": 829,
|
||||
"image_tokens": 1198
|
||||
}
|
||||
}
|
||||
},
|
||||
"response_rect": {
|
||||
"left": 946,
|
||||
"top": 387,
|
||||
"width": 104,
|
||||
"height": 17
|
||||
},
|
||||
"annotation_index_id": 12
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 302 KiB After Width: | Height: | Size: 304 KiB |
@ -79,9 +79,9 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
|
||||
|
||||
const vlCases = [
|
||||
'todo-vl',
|
||||
'aweme-login-vl',
|
||||
'antd-form-vl',
|
||||
'antd-tooltip-vl',
|
||||
// 'aweme-login-vl',
|
||||
// 'antd-form-vl',
|
||||
// 'antd-tooltip-vl',
|
||||
];
|
||||
|
||||
describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
|
||||
@ -117,6 +117,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
|
||||
res = await plan(prompt, {
|
||||
log: testCase.log,
|
||||
context,
|
||||
actionContext: testCase.action_context,
|
||||
});
|
||||
} catch (error) {
|
||||
res = error as Error;
|
||||
|
||||
@ -20,6 +20,7 @@ export type TestCase = {
|
||||
response_planning?: PlanningAIResponse;
|
||||
expected?: boolean;
|
||||
annotation_index_id?: number;
|
||||
action_context?: string;
|
||||
};
|
||||
|
||||
export type InspectAiTestCase = {
|
||||
|
||||
@ -52,6 +52,7 @@ export interface PageAgentOpt {
|
||||
/* if auto print report msg, default true */
|
||||
autoPrintReportMsg?: boolean;
|
||||
onTaskStartTip?: OnTaskStartTip;
|
||||
aiActionContext?: string;
|
||||
}
|
||||
|
||||
export class PageAgent<PageType extends WebPage = WebPage> {
|
||||
@ -119,6 +120,10 @@ export class PageAgent<PageType extends WebPage = WebPage> {
|
||||
});
|
||||
}
|
||||
|
||||
async setAIActionContext(prompt: string) {
|
||||
this.opts.aiActionContext = prompt;
|
||||
}
|
||||
|
||||
resetDump() {
|
||||
this.dump = {
|
||||
groupName: this.opts.groupName!,
|
||||
@ -283,7 +288,7 @@ export class PageAgent<PageType extends WebPage = WebPage> {
|
||||
async aiAction(taskPrompt: string) {
|
||||
const { executor } = await (getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS)
|
||||
? this.taskExecutor.actionToGoal(taskPrompt)
|
||||
: this.taskExecutor.action(taskPrompt));
|
||||
: this.taskExecutor.action(taskPrompt, this.opts.aiActionContext));
|
||||
|
||||
this.afterTaskRunning(executor);
|
||||
}
|
||||
|
||||
@ -511,6 +511,7 @@ export class PageTaskExecutor {
|
||||
userInstruction: string,
|
||||
cacheGroup: ReturnType<TaskCache['getCacheGroupByPrompt']>,
|
||||
log?: string,
|
||||
actionContext?: string,
|
||||
) {
|
||||
const task: ExecutionTaskPlanningApply = {
|
||||
type: 'Planning',
|
||||
@ -555,6 +556,7 @@ export class PageTaskExecutor {
|
||||
planResult = await plan(param.userInstruction, {
|
||||
context: pageContext,
|
||||
log: param.log,
|
||||
actionContext,
|
||||
});
|
||||
}
|
||||
|
||||
@ -761,14 +763,22 @@ export class PageTaskExecutor {
|
||||
};
|
||||
}
|
||||
|
||||
async action(userPrompt: string): Promise<ExecutionResult> {
|
||||
async action(
|
||||
userPrompt: string,
|
||||
actionContext?: string,
|
||||
): Promise<ExecutionResult> {
|
||||
const taskExecutor = new Executor(taskTitleStr('Action', userPrompt), {
|
||||
onTaskStart: this.onTaskStartCallback,
|
||||
});
|
||||
|
||||
const cacheGroup = this.taskCache.getCacheGroupByPrompt(userPrompt);
|
||||
let planningTask: ExecutionTaskPlanningApply | null =
|
||||
this.planningTaskFromPrompt(userPrompt, cacheGroup);
|
||||
this.planningTaskFromPrompt(
|
||||
userPrompt,
|
||||
cacheGroup,
|
||||
undefined,
|
||||
actionContext,
|
||||
);
|
||||
let result: any;
|
||||
let replanCount = 0;
|
||||
const logList: string[] = [];
|
||||
@ -825,6 +835,7 @@ export class PageTaskExecutor {
|
||||
userPrompt,
|
||||
cacheGroup,
|
||||
logList.length > 0 ? `- ${logList.join('\n- ')}` : undefined,
|
||||
actionContext,
|
||||
);
|
||||
replanCount++;
|
||||
}
|
||||
|
||||
@ -177,6 +177,7 @@ export async function puppeteerAgentForTarget(
|
||||
autoPrintReportMsg: false,
|
||||
testId: preference?.testId,
|
||||
cacheId: preference?.cacheId,
|
||||
aiActionContext: target.aiActionContext,
|
||||
forceSameTabNavigation:
|
||||
typeof target.forceSameTabNavigation !== 'undefined'
|
||||
? target.forceSameTabNavigation
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user