feat: add context for aiAction (#528)

2025-12-27 15:10:20 +00:00 · 2025-04-02 20:34:23 +08:00 · 2025-04-02 20:34:23 +08:00 · bcdf90b997
commit bcdf90b997
parent 0c0675ba4b
16 changed files with 323 additions and 143 deletions
--- a/apps/site/docs/en/API.mdx
+++ b/apps/site/docs/en/API.mdx
@ -15,6 +15,7 @@ These Agents share some common constructor parameters:
 * `generateReport: boolean`: If true, a report file will be generated. (Default: true)
 * `autoPrintReportMsg: boolean`: If true, report messages will be printed. (Default: true)
 * `cacheId: string | undefined`: If provided, this cacheId will be used to save or match the cache. (Default: undefined, means cache feature is disabled)
+* `aiActionContext: string`: Background knowledge that is sent to the AI model when calling `agent.aiAction()`, e.g. 'close the cookie consent dialog first if it exists'. (Default: undefined)

 In Puppeteer, there is an additional parameter:

@ -366,6 +367,26 @@ console.log(result);
 For more information about YAML scripts, please refer to [Automate with Scripts in YAML](./automate-with-scripts-in-yaml).
 :::

+### `agent.setAIActionContext()`
+
+This method allows you to set the background knowledge that should be sent to the AI model when calling `agent.aiAction()`.
+
+* Type
+
+```typescript
+function setAIActionContext(actionContext: string): Promise<void>;
+```
+
+* Parameters:
+  * `actionContext: string` - The background knowledge that should be sent to the AI model.
+
+* Example:
+
+```typescript
+await agent.setAIActionContext('Close the cookie consent dialog first if it exists');
+```
+
+
 ## Properties

 ### `.reportFile`
--- a/apps/site/docs/en/automate-with-scripts-in-yaml.mdx
+++ b/apps/site/docs/en/automate-with-scripts-in-yaml.mdx
@ -158,6 +158,9 @@ target:

  # boolean, if close the new tabs after the bridge is disconnected, optional, default is false
  closeNewTabsAfterDisconnect: <boolean>
+
+  # string, the background knowledge to send to the AI model when calling aiAction, optional
+  aiActionContext: <string>
 ```

 ### `tasks` part
--- a/apps/site/docs/zh/API.mdx
+++ b/apps/site/docs/zh/API.mdx
@ -14,6 +14,7 @@ Midscene 中每个 Agent 都有自己的构造函数。
 * `generateReport: boolean`: 如果为 true，则生成报告文件。默认值为 true。
 * `autoPrintReportMsg: boolean`: 如果为 true，则打印报告消息。默认值为 true。
 * `cacheId: string | undefined`: 如果配置，则使用此 cacheId 保存或匹配缓存。默认值为 undefined，也就是不启用缓存。
+* `aiActionContext: string`: 调用 `agent.aiAction()` 时，发送给 AI 模型的背景知识，比如 '有 cookie 对话框时先关闭它'。默认值为 undefined。

 在 puppeteer 中，还有一个额外的参数：

@ -364,6 +365,25 @@ console.log(result);
 更多关于 YAML 脚本的信息，请参考 [Automate with Scripts in YAML](./automate-with-scripts-in-yaml)。
 :::

+### `agent.setAIActionContext()`
+
+这个方法允许你设置在调用 `agent.aiAction()` 时，发送给 AI 模型的背景知识。
+
+* 类型
+
+```typescript
+function setAIActionContext(actionContext: string): Promise<void>;
+```
+
+* 参数：
+  * `actionContext: string` - 要发送给 AI 模型的背景知识。
+
+* 示例：
+
+```typescript
+await agent.setAIActionContext('如果 “使用cookie” 对话框存在，先关闭它');
+```
+
 ## 属性

 ### `.reportFile`
--- a/apps/site/docs/zh/automate-with-scripts-in-yaml.mdx
+++ b/apps/site/docs/zh/automate-with-scripts-in-yaml.mdx
@ -158,6 +158,9 @@ target:

  # 是否在桥接断开时关闭新创建的标签页，可选，默认 false
  closeNewTabsAfterDisconnect: <boolean>
+
+  # 在调用 aiAction 时发送给 AI 模型的背景知识，可选
+  aiActionContext: <string>
 ```

 ### `tasks` 部分
--- a/packages/core/src/ai-model/llm-planning.ts
+++ b/packages/core/src/ai-model/llm-planning.ts
@ -20,6 +20,7 @@ export async function plan(
  userInstruction: string,
  opts: {
    log?: string;
+    actionContext?: string;
    context: UIContext;
    callAI?: typeof callAiFn<PlanningAIResponse>;
  },
@ -32,6 +33,7 @@ export async function plan(
  const taskBackgroundContextText = generateTaskBackgroundContext(
    userInstruction,
    opts.log,
+    opts.actionContext,
  );
  const userInstructionPrompt = await automationUserPrompt().format({
    pageDescription,
@ -39,7 +41,7 @@ export async function plan(
  });

  let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
-  if (vlLocateMode()) {
+  if (vlLocateMode() === 'qwen-vl') {
    imagePayload = await paddingToMatchBlockByBase64(imagePayload);
  }

--- a/packages/core/src/ai-model/prompt/llm-planning.ts
+++ b/packages/core/src/ai-model/prompt/llm-planning.ts
@ -309,12 +309,18 @@ export const planSchema: ResponseFormatJSONSchema = {
 export const generateTaskBackgroundContext = (
  userInstruction: string,
  log?: string,
+  userActionContext?: string,
 ) => {
  if (log) {
    return `
 Here is the user's instruction:
+
 <instruction>
-${userInstruction}
+  <high_priority_knowledge>
+    ${userActionContext}
+  </high_priority_knowledge>
+
+  ${userInstruction}
 </instruction>

 These are the logs from previous executions, which indicate what was done in the previous actions.
@ -328,8 +334,13 @@ ${log}
  return `
 Here is the user's instruction:
 <instruction>
-${userInstruction}
-</instruction>`;
+  <high_priority_knowledge>
+    ${userActionContext}
+  </high_priority_knowledge>
+
+  ${userInstruction}
+</instruction>
+`;
 };

 export const automationUserPrompt = () => {
--- a/packages/core/src/yaml.ts
+++ b/packages/core/src/yaml.ts
@ -27,6 +27,10 @@ export interface MidsceneYamlTask {
 }

 export interface MidsceneYamlScriptEnv {
+  output?: string;
+  aiActionContext?: string;
+
+  // for web only
  serve?: string;
  url: string;

@ -40,7 +44,6 @@ export interface MidsceneYamlScriptEnv {
    continueOnNetworkIdleError?: boolean; // should continue if failed to wait for network idle, true for default
  };
  cookie?: string;
-  output?: string;
  forceSameTabNavigation?: boolean; // if track the newly opened tab, true for default in yaml script

  // bridge mode config
--- a/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
+++ b/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
@ -304,8 +304,13 @@ Reason:
 exports[`system prompts > planning - background context 1`] = `
 "
 Here is the user's instruction:
+
 <instruction>
-THIS IS USER INSTRUCTION
+  <high_priority_knowledge>
+    THIS IS BACKGROUND PROMPT
+  </high_priority_knowledge>
+
+  THIS IS USER INSTRUCTION
 </instruction>

 These are the logs from previous executions, which indicate what was done in the previous actions.
--- a/packages/core/tests/unit-test/prompt/prompt.test.ts
+++ b/packages/core/tests/unit-test/prompt/prompt.test.ts
@ -31,6 +31,7 @@ describe('system prompts', () => {
    const context = generateTaskBackgroundContext(
      'THIS IS USER INSTRUCTION',
      'THIS IS WHAT HAS BEEN DONE',
+      'THIS IS BACKGROUND PROMPT',
    );
    expect(context).toMatchSnapshot();
  });
@ -42,6 +43,7 @@ describe('system prompts', () => {
    const result = await prompt.format({
      pageDescription: 'THIS IS PAGE DESCRIPTION',
      taskBackgroundContext: 'THIS IS BACKGROUND CONTEXT',
+      userActionContext: 'THIS IS BACKGROUND PROMPT',
    });
    expect(result).toMatchSnapshot();
  });
--- a/packages/evaluation/page-cases/planning/todo-vl.json
+++ b/packages/evaluation/page-cases/planning/todo-vl.json
@ -1,6 +1,81 @@
 {
  "testDataPath": "todo",
  "testCases": [
+    {
+      "prompt": "type 'hello' in the input box, sleep 5s, hit enter",
+      "response_planning": {
+        "what_the_user_wants_to_do_next_by_instruction": "type 'hello' in the input box",
+        "log": "I will use action Input to type 'hello' in the input box first.",
+        "more_actions_needed_by_instruction": true,
+        "action": {
+          "type": "Input",
+          "locate": {
+            "prompt": "input box for adding a new todo",
+            "bbox": [513, 128, 1067, 197]
+          },
+          "param": {
+            "value": "hello"
+          }
+        },
+        "sleep": 5000,
+        "actions": [
+          {
+            "type": "Input",
+            "locate": {
+              "prompt": "input box for adding a new todo",
+              "bbox": [513, 128, 1067, 197]
+            },
+            "param": {
+              "value": "hello"
+            }
+          }
+        ],
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n  \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        128,\n        1067,\n        197\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  },\n  \"sleep\": 5000\n}",
+        "usage": {
+          "prompt_tokens": 1979,
+          "completion_tokens": 147,
+          "total_tokens": 2126,
+          "completion_tokens_details": {
+            "text_tokens": 147
+          },
+          "prompt_tokens_details": {
+            "text_tokens": 781,
+            "image_tokens": 1198
+          }
+        }
+      },
+      "response_bbox": [513, 128, 1067, 194],
+      "annotation_index_id": 1,
+      "response_rect": {
+        "left": 513,
+        "top": 128,
+        "width": 554,
+        "height": 69
+      }
+    },
+    {
+      "prompt": "if there are five entries in the list, click the delete button of the second entry and wait 3s. Otherwise, do nothing.",
+      "response_planning": {
+        "what_the_user_wants_to_do_next_by_instruction": "Check if there are five entries in the list and click the delete button of the second entry if so.",
+        "log": "There are only three entries in the list, so no action is needed according to the instruction.",
+        "more_actions_needed_by_instruction": false,
+        "action": null,
+        "actions": [],
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"Check if there are five entries in the list and click the delete button of the second entry if so.\",\n  \"log\": \"There are only three entries in the list, so no action is needed according to the instruction.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": null\n}",
+        "usage": {
+          "prompt_tokens": 1990,
+          "completion_tokens": 82,
+          "total_tokens": 2072,
+          "completion_tokens_details": {
+            "text_tokens": 82
+          },
+          "prompt_tokens_details": {
+            "text_tokens": 792,
+            "image_tokens": 1198
+          }
+        }
+      }
+    },
    {
      "prompt": "type 'hello' in the input box, sleep 5s, hit enter",
      "response_planning": {
@ -32,89 +107,14 @@
        ],
        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n  \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        128,\n        1067,\n        194\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  },\n  \"sleep\": 5000\n}",
        "usage": {
-          "prompt_tokens": 1961,
+          "prompt_tokens": 1979,
          "completion_tokens": 147,
-          "total_tokens": 2108,
+          "total_tokens": 2126,
          "completion_tokens_details": {
            "text_tokens": 147
          },
          "prompt_tokens_details": {
-            "text_tokens": 763,
-            "image_tokens": 1198
-          }
-        }
-      },
-      "response_bbox": [513, 128, 1067, 194],
-      "annotation_index_id": 1,
-      "response_rect": {
-        "left": 513,
-        "top": 128,
-        "width": 554,
-        "height": 66
-      }
-    },
-    {
-      "prompt": "if there are five entries in the list, click the delete button of the second entry and wait 3s. Otherwise, do nothing.",
-      "response_planning": {
-        "what_the_user_wants_to_do_next_by_instruction": "Check if there are five entries in the list and click the delete button of the second entry if so.",
-        "log": "There are only three entries in the list, so no action is needed according to the instruction.",
-        "more_actions_needed_by_instruction": false,
-        "action": null,
-        "actions": [],
-        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"Check if there are five entries in the list and click the delete button of the second entry if so.\",\n  \"log\": \"There are only three entries in the list, so no action is needed according to the instruction.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": null\n}",
-        "usage": {
-          "prompt_tokens": 1972,
-          "completion_tokens": 82,
-          "total_tokens": 2054,
-          "completion_tokens_details": {
-            "text_tokens": 82
-          },
-          "prompt_tokens_details": {
-            "text_tokens": 774,
-            "image_tokens": 1198
-          }
-        }
-      }
-    },
-    {
-      "prompt": "type 'hello' in the input box, sleep 5s, hit enter",
-      "response_planning": {
-        "what_the_user_wants_to_do_next_by_instruction": "type 'hello' in the input box",
-        "log": "I will use action Input to type 'hello' in the input box first.",
-        "more_actions_needed_by_instruction": true,
-        "action": {
-          "type": "Input",
-          "locate": {
-            "prompt": "input box for adding new todo",
-            "bbox": [513, 128, 1067, 194]
-          },
-          "param": {
-            "value": "hello"
-          }
-        },
-        "sleep": 5000,
-        "actions": [
-          {
-            "type": "Input",
-            "locate": {
-              "prompt": "input box for adding new todo",
-              "bbox": [513, 128, 1067, 194]
-            },
-            "param": {
-              "value": "hello"
-            }
-          }
-        ],
-        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n  \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        128,\n        1067,\n        194\n      ],\n      \"prompt\": \"input box for adding new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  },\n  \"sleep\": 5000\n}",
-        "usage": {
-          "prompt_tokens": 1961,
-          "completion_tokens": 146,
-          "total_tokens": 2107,
-          "completion_tokens_details": {
-            "text_tokens": 146
-          },
-          "prompt_tokens_details": {
-            "text_tokens": 763,
+            "text_tokens": 781,
            "image_tokens": 1198
          }
        }
@ -150,16 +150,16 @@
            }
          }
        ],
-        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"hit enter\",\n  \"log\": \"I will use action KeyboardPress to hit the Enter key.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": {\n    \"type\": \"KeyboardPress\",\n    \"param\": {\n      \"value\": \"Enter\"\n    }\n  },\n  \"sleep\": null\n}",
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"hit enter\",\n  \"log\": \"I will use action KeyboardPress to hit the Enter key.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": {\n    \"type\": \"KeyboardPress\",\n    \"param\": {\n      \"value\": \"Enter\"\n    }\n  },\n  \"sleep\": 5000\n}",
        "usage": {
-          "prompt_tokens": 2007,
-          "completion_tokens": 86,
-          "total_tokens": 2093,
+          "prompt_tokens": 2025,
+          "completion_tokens": 90,
+          "total_tokens": 2115,
          "completion_tokens_details": {
-            "text_tokens": 86
+            "text_tokens": 90
          },
          "prompt_tokens_details": {
-            "text_tokens": 809,
+            "text_tokens": 827,
            "image_tokens": 1198
          }
        }
@ -175,7 +175,7 @@
          "type": "Input",
          "locate": {
            "prompt": "input box for adding a new todo",
-            "bbox": [513, 128, 1067, 194]
+            "bbox": [513, 127, 1068, 194]
          },
          "param": {
            "value": "hello"
@ -186,23 +186,23 @@
            "type": "Input",
            "locate": {
              "prompt": "input box for adding a new todo",
-              "bbox": [513, 128, 1067, 194]
+              "bbox": [513, 127, 1068, 194]
            },
            "param": {
              "value": "hello"
            }
          }
        ],
-        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n  \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        128,\n        1067,\n        194\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  }\n}",
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"type 'hello' in the input box\",\n  \"log\": \"I will use action Input to type 'hello' in the input box first.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        127,\n        1068,\n        194\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  }\n}",
        "usage": {
-          "prompt_tokens": 1953,
+          "prompt_tokens": 1971,
          "completion_tokens": 137,
-          "total_tokens": 2090,
+          "total_tokens": 2108,
          "completion_tokens_details": {
            "text_tokens": 137
          },
          "prompt_tokens_details": {
-            "text_tokens": 755,
+            "text_tokens": 773,
            "image_tokens": 1198
          }
        }
@ -211,15 +211,15 @@
      "annotation_index_id": 5,
      "response_rect": {
        "left": 513,
-        "top": 128,
-        "width": 554,
-        "height": 66
+        "top": 127,
+        "width": 555,
+        "height": 67
      }
    },
    {
      "prompt": "click OK",
      "response_planning": {
-        "error": "Failed to plan actions: No 'OK' button found in the current view."
+        "error": "Failed to plan actions: There is no visible 'OK' button in the screenshot provided."
      }
    },
    {
@ -233,14 +233,14 @@
        "actions": [],
        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it\",\n  \"log\": \"There is no 'OK' button visible in the screenshot. No action needed.\",\n  \"error\": null,\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": null\n}",
        "usage": {
-          "prompt_tokens": 1956,
+          "prompt_tokens": 1974,
          "completion_tokens": 73,
-          "total_tokens": 2029,
+          "total_tokens": 2047,
          "completion_tokens_details": {
            "text_tokens": 73
          },
          "prompt_tokens_details": {
-            "text_tokens": 758,
+            "text_tokens": 776,
            "image_tokens": 1198
          }
        }
@ -250,22 +250,22 @@
      "prompt": "if there is an 'OK' button, click it. If not, try again in next step",
      "response_planning": {
        "what_the_user_wants_to_do_next_by_instruction": "check if there is an 'OK' button and click it if found",
-        "log": "I will check the screenshot for an 'OK' button. If not found, I will wait for the next step.",
+        "log": "I will check the screenshot for an 'OK' button. If not found, I will log that no action should be taken.",
        "error": null,
        "more_actions_needed_by_instruction": true,
        "action": null,
-        "sleep": 1000,
+        "sleep": null,
        "actions": [],
-        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it if found\",\n  \"log\": \"I will check the screenshot for an 'OK' button. If not found, I will wait for the next step.\",\n  \"error\": null,\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": null,\n  \"sleep\": 1000\n}",
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"check if there is an 'OK' button and click it if found\",\n  \"log\": \"I will check the screenshot for an 'OK' button. If not found, I will log that no action should be taken.\",\n  \"error\": null,\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": null,\n  \"sleep\": null\n}",
        "usage": {
-          "prompt_tokens": 1965,
-          "completion_tokens": 97,
-          "total_tokens": 2062,
+          "prompt_tokens": 1983,
+          "completion_tokens": 95,
+          "total_tokens": 2078,
          "completion_tokens_details": {
-            "text_tokens": 97
+            "text_tokens": 95
          },
          "prompt_tokens_details": {
-            "text_tokens": 767,
+            "text_tokens": 785,
            "image_tokens": 1198
          }
        }
@ -276,35 +276,33 @@
      "response_planning": {
        "what_the_user_wants_to_do_next_by_instruction": "Move the mouse to the second item in the task list and click the delete button on the right of the second task",
        "log": "I will use action Hover to move the mouse to the second item in the task list first.",
-        "error": null,
        "more_actions_needed_by_instruction": true,
        "action": {
          "type": "Hover",
          "locate": {
            "prompt": "second item in the task list",
-            "bbox": [574, 269, 693, 298]
+            "bbox": [574, 276, 693, 296]
          }
        },
-        "sleep": 200,
        "actions": [
          {
            "type": "Hover",
            "locate": {
              "prompt": "second item in the task list",
-              "bbox": [574, 269, 693, 298]
+              "bbox": [574, 276, 693, 296]
            }
          }
        ],
-        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"Move the mouse to the second item in the task list and click the delete button on the right of the second task\",\n  \"log\": \"I will use action Hover to move the mouse to the second item in the task list first.\",\n  \"error\": null,\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Hover\",\n    \"locate\": {\n      \"bbox_2d\": [\n        574,\n        269,\n        693,\n        298\n      ],\n      \"prompt\": \"second item in the task list\"\n    }\n  },\n  \"sleep\": 200\n}",
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"Move the mouse to the second item in the task list and click the delete button on the right of the second task\",\n  \"log\": \"I will use action Hover to move the mouse to the second item in the task list first.\",\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Hover\",\n    \"locate\": {\n      \"bbox_2d\": [\n        574,\n        276,\n        693,\n        296\n      ],\n      \"prompt\": \"second item in the task list\"\n    }\n  }\n}",
        "usage": {
-          "prompt_tokens": 1968,
-          "completion_tokens": 154,
-          "total_tokens": 2122,
+          "prompt_tokens": 1986,
+          "completion_tokens": 139,
+          "total_tokens": 2125,
          "completion_tokens_details": {
-            "text_tokens": 154
+            "text_tokens": 139
          },
          "prompt_tokens_details": {
-            "text_tokens": 770,
+            "text_tokens": 788,
            "image_tokens": 1198
          }
        }
@ -313,22 +311,22 @@
      "annotation_index_id": 9,
      "response_rect": {
        "left": 574,
-        "top": 269,
+        "top": 276,
        "width": 119,
-        "height": 29
+        "height": 20
      }
    },
    {
      "prompt": "在任务框 input 输入 今天学习 JS，按回车键",
      "response_planning": {
        "what_the_user_wants_to_do_next_by_instruction": "在任务框 input 输入 今天学习 JS，按回车键",
-        "log": "我将使用 Input 动作来填写任务框中的内容。",
+        "log": "我将使用 Input 动作来填写任务框，并输入 '今天学习 JS'。",
        "more_actions_needed_by_instruction": true,
        "action": {
          "type": "Input",
          "locate": {
-            "prompt": "任务输入框",
-            "bbox": [513, 127, 1068, 197]
+            "prompt": "任务框的输入区域",
+            "bbox": [513, 128, 1067, 194]
          },
          "param": {
            "value": "今天学习 JS"
@ -338,24 +336,24 @@
          {
            "type": "Input",
            "locate": {
-              "prompt": "任务输入框",
-              "bbox": [513, 127, 1068, 197]
+              "prompt": "任务框的输入区域",
+              "bbox": [513, 128, 1067, 194]
            },
            "param": {
              "value": "今天学习 JS"
            }
          }
        ],
-        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"在任务框 input 输入 今天学习 JS，按回车键\",\n  \"log\": \"我将使用 Input 动作来填写任务框中的内容。\",\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        127,\n        1068,\n        197\n      ],\n      \"prompt\": \"任务输入框\"\n    },\n    \"param\": {\n      \"value\": \"今天学习 JS\"\n    }\n  }\n}",
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"在任务框 input 输入 今天学习 JS，按回车键\",\n  \"log\": \"我将使用 Input 动作来填写任务框，并输入 '今天学习 JS'。\",\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        128,\n        1067,\n        194\n      ],\n      \"prompt\": \"任务框的输入区域\"\n    },\n    \"param\": {\n      \"value\": \"今天学习 JS\"\n    }\n  }\n}",
        "usage": {
-          "prompt_tokens": 1959,
-          "completion_tokens": 139,
-          "total_tokens": 2098,
+          "prompt_tokens": 1977,
+          "completion_tokens": 146,
+          "total_tokens": 2123,
          "completion_tokens_details": {
-            "text_tokens": 139
+            "text_tokens": 146
          },
          "prompt_tokens_details": {
-            "text_tokens": 761,
+            "text_tokens": 779,
            "image_tokens": 1198
          }
        }
@ -364,10 +362,103 @@
      "annotation_index_id": 10,
      "response_rect": {
        "left": 513,
-        "top": 127,
-        "width": 555,
-        "height": 70
+        "top": 128,
+        "width": 554,
+        "height": 66
      }
+    },
+    {
+      "prompt": "Click the 'clear completed' button",
+      "action_context": "Always click the input box first before doing anything else",
+      "response_planning": {
+        "what_the_user_wants_to_do_next_by_instruction": "Click the 'clear completed' button",
+        "log": "I will use action Tap to click the input box first before clicking the 'clear completed' button.",
+        "error": null,
+        "more_actions_needed_by_instruction": true,
+        "action": {
+          "type": "Tap",
+          "locate": {
+            "prompt": "input box",
+            "bbox": [574, 156, 839, 176]
+          }
+        },
+        "sleep": 500,
+        "actions": [
+          {
+            "type": "Tap",
+            "locate": {
+              "prompt": "input box",
+              "bbox": [574, 156, 839, 176]
+            }
+          }
+        ],
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"Click the 'clear completed' button\",\n  \"log\": \"I will use action Tap to click the input box first before clicking the 'clear completed' button.\",\n  \"error\": null,\n  \"more_actions_needed_by_instruction\": true,\n  \"action\": {\n    \"type\": \"Tap\",\n    \"locate\": {\n      \"bbox_2d\": [\n        574,\n        156,\n        839,\n        176\n      ],\n      \"prompt\": \"input box\"\n    }\n  },\n  \"sleep\": 500\n}",
+        "usage": {
+          "prompt_tokens": 1979,
+          "completion_tokens": 135,
+          "total_tokens": 2114,
+          "completion_tokens_details": {
+            "text_tokens": 135
+          },
+          "prompt_tokens_details": {
+            "text_tokens": 781,
+            "image_tokens": 1198
+          }
+        }
+      },
+      "response_rect": {
+        "left": 574,
+        "top": 156,
+        "width": 265,
+        "height": 20
+      },
+      "annotation_index_id": 11
+    },
+    {
+      "prompt": "Click the 'clear completed' button",
+      "action_context": "Always click the input box first before doing anything else",
+      "log": "I will use action Tap to click the input box first before doing anything else.",
+      "response_planning": {
+        "what_the_user_wants_to_do_next_by_instruction": "Click the 'clear completed' button",
+        "log": "I will use action Tap to click the 'clear completed' button next.",
+        "more_actions_needed_by_instruction": false,
+        "action": {
+          "type": "Tap",
+          "locate": {
+            "prompt": "'Clear completed' button",
+            "bbox": [946, 387, 1050, 404]
+          }
+        },
+        "actions": [
+          {
+            "type": "Tap",
+            "locate": {
+              "prompt": "'Clear completed' button",
+              "bbox": [946, 387, 1050, 404]
+            }
+          }
+        ],
+        "rawResponse": "{\n  \"what_the_user_wants_to_do_next_by_instruction\": \"Click the 'clear completed' button\",\n  \"log\": \"I will use action Tap to click the 'clear completed' button next.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"action\": {\n    \"type\": \"Tap\",\n    \"locate\": {\n      \"bbox_2d\": [\n        946,\n        387,\n        1050,\n        404\n      ],\n      \"prompt\": \"'Clear completed' button\"\n    }\n  }\n}",
+        "usage": {
+          "prompt_tokens": 2027,
+          "completion_tokens": 118,
+          "total_tokens": 2145,
+          "completion_tokens_details": {
+            "text_tokens": 118
+          },
+          "prompt_tokens_details": {
+            "text_tokens": 829,
+            "image_tokens": 1198
+          }
+        }
+      },
+      "response_rect": {
+        "left": 946,
+        "top": 387,
+        "width": 104,
+        "height": 17
+      },
+      "annotation_index_id": 12
    }
  ]
 }
--- a/packages/evaluation/page-cases/planning/todo-vl.json-planning-coordinates-annotated.png
+++ b/packages/evaluation/page-cases/planning/todo-vl.json-planning-coordinates-annotated.png
--- a/packages/evaluation/tests/llm-planning.test.ts
+++ b/packages/evaluation/tests/llm-planning.test.ts
@ -79,9 +79,9 @@ describe.skipIf(vlMode)('ai planning - by element', () => {

 const vlCases = [
  'todo-vl',
-  'aweme-login-vl',
-  'antd-form-vl',
-  'antd-tooltip-vl',
+  // 'aweme-login-vl',
+  // 'antd-form-vl',
+  // 'antd-tooltip-vl',
 ];

 describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
@ -117,6 +117,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
            res = await plan(prompt, {
              log: testCase.log,
              context,
+              actionContext: testCase.action_context,
            });
          } catch (error) {
            res = error as Error;
--- a/packages/evaluation/tests/util.ts
+++ b/packages/evaluation/tests/util.ts
@ -20,6 +20,7 @@ export type TestCase = {
  response_planning?: PlanningAIResponse;
  expected?: boolean;
  annotation_index_id?: number;
+  action_context?: string;
 };

 export type InspectAiTestCase = {
--- a/packages/web-integration/src/common/agent.ts
+++ b/packages/web-integration/src/common/agent.ts
@ -52,6 +52,7 @@ export interface PageAgentOpt {
  /* if auto print report msg, default true */
  autoPrintReportMsg?: boolean;
  onTaskStartTip?: OnTaskStartTip;
+  aiActionContext?: string;
 }

 export class PageAgent<PageType extends WebPage = WebPage> {
@ -119,6 +120,10 @@ export class PageAgent<PageType extends WebPage = WebPage> {
    });
  }

+  async setAIActionContext(prompt: string) {
+    this.opts.aiActionContext = prompt;
+  }
+
  resetDump() {
    this.dump = {
      groupName: this.opts.groupName!,
@ -283,7 +288,7 @@ export class PageAgent<PageType extends WebPage = WebPage> {
  async aiAction(taskPrompt: string) {
    const { executor } = await (getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS)
      ? this.taskExecutor.actionToGoal(taskPrompt)
-      : this.taskExecutor.action(taskPrompt));
+      : this.taskExecutor.action(taskPrompt, this.opts.aiActionContext));

    this.afterTaskRunning(executor);
  }
--- a/packages/web-integration/src/common/tasks.ts
+++ b/packages/web-integration/src/common/tasks.ts
@ -511,6 +511,7 @@ export class PageTaskExecutor {
    userInstruction: string,
    cacheGroup: ReturnType<TaskCache['getCacheGroupByPrompt']>,
    log?: string,
+    actionContext?: string,
  ) {
    const task: ExecutionTaskPlanningApply = {
      type: 'Planning',
@ -555,6 +556,7 @@ export class PageTaskExecutor {
          planResult = await plan(param.userInstruction, {
            context: pageContext,
            log: param.log,
+            actionContext,
          });
        }

@ -761,14 +763,22 @@ export class PageTaskExecutor {
    };
  }

-  async action(userPrompt: string): Promise<ExecutionResult> {
+  async action(
+    userPrompt: string,
+    actionContext?: string,
+  ): Promise<ExecutionResult> {
    const taskExecutor = new Executor(taskTitleStr('Action', userPrompt), {
      onTaskStart: this.onTaskStartCallback,
    });

    const cacheGroup = this.taskCache.getCacheGroupByPrompt(userPrompt);
    let planningTask: ExecutionTaskPlanningApply | null =
-      this.planningTaskFromPrompt(userPrompt, cacheGroup);
+      this.planningTaskFromPrompt(
+        userPrompt,
+        cacheGroup,
+        undefined,
+        actionContext,
+      );
    let result: any;
    let replanCount = 0;
    const logList: string[] = [];
@ -825,6 +835,7 @@ export class PageTaskExecutor {
        userPrompt,
        cacheGroup,
        logList.length > 0 ? `- ${logList.join('\n- ')}` : undefined,
+        actionContext,
      );
      replanCount++;
    }
--- a/packages/web-integration/src/puppeteer/agent-launcher.ts
+++ b/packages/web-integration/src/puppeteer/agent-launcher.ts
@ -177,6 +177,7 @@ export async function puppeteerAgentForTarget(
    autoPrintReportMsg: false,
    testId: preference?.testId,
    cacheId: preference?.cacheId,
+    aiActionContext: target.aiActionContext,
    forceSameTabNavigation:
      typeof target.forceSameTabNavigation !== 'undefined'
        ? target.forceSameTabNavigation