fix: race condition in bridge mode (#440)

2025-12-27 15:10:20 +00:00 · 2025-03-07 17:38:46 +08:00 · 2025-03-07 17:38:46 +08:00 · 372b67f16c
commit 372b67f16c
parent 5d63ef9151
12 changed files with 112 additions and 98 deletions
--- a/packages/cli/src/yaml-runner.ts
+++ b/packages/cli/src/yaml-runner.ts
@ -97,7 +97,7 @@ export async function playYamlFiles(
        target.cookie
      ) {
        console.warn(
-          'puppeteer options (userAgent, viewportWidth, viewportHeight, viewportScale, waitForNetworkIdle, cookie) are not supported in bridge mode, will be ignored',
+          'puppeteer options (userAgent, viewportWidth, viewportHeight, viewportScale, waitForNetworkIdle, cookie) are not supported in bridge mode. They will be ignored.',
        );
      }

--- a/packages/core/src/ai-model/llm-planning.ts
+++ b/packages/core/src/ai-model/llm-planning.ts
@ -88,12 +88,18 @@ export async function plan(
  }

  assert(planFromAI, "can't get plans from AI");
-  assert(
-    actions.length > 0 ||
-      !returnValue.more_actions_needed_by_instruction ||
-      returnValue.sleep,
-    `Failed to plan actions: ${planFromAI.error || '(no error details)'}`,
-  );
+  assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
+
+  if (
+    actions.length === 0 &&
+    returnValue.more_actions_needed_by_instruction &&
+    !returnValue.sleep
+  ) {
+    console.warn(
+      'No actions planned for the prompt, but model said more actions are needed:',
+      userInstruction,
+    );
+  }

  return returnValue;
 }
--- a/packages/core/src/ai-model/prompt/llm-planning.ts
+++ b/packages/core/src/ai-model/prompt/llm-planning.ts
@ -8,9 +8,9 @@ import type { ResponseFormatJSONSchema } from 'openai/resources';
 import { samplePageDescription } from './util';

 // Note: put the log field first to trigger the CoT
-const commonOutputFields = `"log": string, // Log what this action(s) you just planned do. Use the same language as the user's instruction.
+const commonOutputFields = `"log"?: string, // Log what this action(s) you just planned do. Use the same language as the user's instruction. Omit this field if there is an error and error message is provided.
  "more_actions_needed_by_instruction": boolean, // Consider if all the actions described in the instruction have been covered by this action and logs. If so, set this field to false. Otherwise, you must have a clear reason what the remaining actions are.
-  "error"?: string // Error messages about unexpected situations, if any. Use the same language as the user's instruction.`;
+  "error"?: string // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.`;

 const qwenLocateParam =
  'locate: {bbox_2d: [number, number, number, number], prompt: string }';
--- a/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
+++ b/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
@ -234,9 +234,9 @@ The JSON format is as follows:
  "actions": [
    // ... some actions
  ],
-  "log": string, // Log what this action(s) you just planned do. Use the same language as the user's instruction.
+  "log"?: string, // Log what this action(s) you just planned do. Use the same language as the user's instruction. Omit this field if there is an error and error message is provided.
  "more_actions_needed_by_instruction": boolean, // Consider if all the actions described in the instruction have been covered by this action and logs. If so, set this field to false. Otherwise, you must have a clear reason what the remaining actions are.
-  "error"?: string // Error messages about unexpected situations, if any. Use the same language as the user's instruction.
+  "error"?: string // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
 }

 ## Examples
@ -341,9 +341,9 @@ Return in JSON format:
    } | null,
  ,
  "sleep"?: number, // The sleep time after the action, in milliseconds.
-  "log": string, // Log what this action(s) you just planned do. Use the same language as the user's instruction.
+  "log"?: string, // Log what this action(s) you just planned do. Use the same language as the user's instruction. Omit this field if there is an error and error message is provided.
  "more_actions_needed_by_instruction": boolean, // Consider if all the actions described in the instruction have been covered by this action and logs. If so, set this field to false. Otherwise, you must have a clear reason what the remaining actions are.
-  "error"?: string // Error messages about unexpected situations, if any. Use the same language as the user's instruction.
+  "error"?: string // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
 }
 "
 `;
--- a/packages/evaluation/page-cases/planning/aweme-login-vl.json
+++ b/packages/evaluation/page-cases/planning/aweme-login-vl.json
@ -5,37 +5,7 @@
      "prompt": "type 'user' in the username input box, type '123456' in the password input box",
      "log": "type 'user' in the username input box",
      "response_planning": {
-        "action": {
-          "type": "Input",
-          "locate": {
-            "prompt": "password input box",
-            "bbox": [493, 417, 786, 465]
-          },
-          "param": {
-            "value": "123456"
-          }
-        },
-        "sleep": 500,
-        "log": "type '123456' in the password input box",
-        "more_actions_needed_by_instruction": false,
-        "actions": [
-          {
-            "type": "Input",
-            "locate": {
-              "prompt": "password input box",
-              "bbox": [493, 417, 786, 465]
-            },
-            "param": {
-              "value": "123456"
-            }
-          }
-        ],
-        "rawResponse": "{\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        493,\n        417,\n        786,\n        465\n      ],\n      \"prompt\": \"password input box\"\n    },\n    \"param\": {\n      \"value\": \"123456\"\n    }\n  },\n  \"sleep\": 500,\n  \"log\": \"type '123456' in the password input box\",\n  \"more_actions_needed_by_instruction\": false\n}",
-        "usage": {
-          "prompt_tokens": 2000,
-          "completion_tokens": 121,
-          "total_tokens": 2121
-        }
+        "error": "Failed to plan actions: The current screen does not have a username or password input box. It is showing a verification code login interface."
      }
    }
  ]
--- a/packages/evaluation/page-cases/planning/todo-vl.json
+++ b/packages/evaluation/page-cases/planning/todo-vl.json
@ -8,7 +8,7 @@
          "type": "Input",
          "locate": {
            "prompt": "input box for adding a new todo",
-            "bbox": [513, 127, 1068, 194]
+            "bbox": [513, 128, 1067, 194]
          },
          "param": {
            "value": "hello"
@ -22,18 +22,18 @@
            "type": "Input",
            "locate": {
              "prompt": "input box for adding a new todo",
-              "bbox": [513, 127, 1068, 194]
+              "bbox": [513, 128, 1067, 194]
            },
            "param": {
              "value": "hello"
            }
          }
        ],
-        "rawResponse": "{\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        127,\n        1068,\n        194\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  },\n  \"sleep\": 5000,\n  \"log\": \"输入 'hello' 到输入框中，然后等待5秒。\",\n  \"more_actions_needed_by_instruction\": true\n}",
+        "rawResponse": "{\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        128,\n        1067,\n        194\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  },\n  \"sleep\": 5000,\n  \"log\": \"输入 'hello' 到输入框中，然后等待5秒。\",\n  \"more_actions_needed_by_instruction\": true\n}",
        "usage": {
-          "prompt_tokens": 1782,
-          "completion_tokens": 122,
-          "total_tokens": 1904
+          "completion_tokens": 123,
+          "prompt_tokens": 1815,
+          "total_tokens": 1938
        }
      }
    },
@ -41,14 +41,15 @@
      "prompt": "if there are five entries in the list, click the delete button of the second entry and wait 3s. Otherwise, do nothing.",
      "response_planning": {
        "action": null,
+        "sleep": 0,
        "log": "There are only three entries in the list, so no action is needed.",
        "more_actions_needed_by_instruction": false,
        "actions": [],
-        "rawResponse": "{\n  \"action\": null,\n  \"log\": \"There are only three entries in the list, so no action is needed.\",\n  \"more_actions_needed_by_instruction\": false\n}",
+        "rawResponse": "{\n  \"action\": null,\n  \"sleep\": 0,\n  \"log\": \"There are only three entries in the list, so no action is needed.\",\n  \"more_actions_needed_by_instruction\": false\n}",
        "usage": {
-          "prompt_tokens": 1793,
-          "completion_tokens": 38,
-          "total_tokens": 1831
+          "completion_tokens": 46,
+          "prompt_tokens": 1826,
+          "total_tokens": 1872
        }
      }
    },
@ -63,7 +64,7 @@
          }
        },
        "sleep": null,
-        "log": "press the Enter key to submit the input.",
+        "log": "hit enter",
        "more_actions_needed_by_instruction": false,
        "error": null,
        "actions": [
@ -74,11 +75,11 @@
            }
          }
        ],
-        "rawResponse": "{\n  \"action\": {\n    \"type\": \"KeyboardPress\",\n    \"param\": {\n      \"value\": \"Enter\"\n    }\n  },\n  \"sleep\": null,\n  \"log\": \"press the Enter key to submit the input.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"error\": null\n}",
+        "rawResponse": "{\n  \"action\": {\n    \"type\": \"KeyboardPress\",\n    \"param\": {\n      \"value\": \"Enter\"\n    }\n  },\n  \"sleep\": null,\n  \"log\": \"hit enter\",\n  \"more_actions_needed_by_instruction\": false,\n  \"error\": null\n}",
        "usage": {
-          "prompt_tokens": 1823,
-          "completion_tokens": 71,
-          "total_tokens": 1894
+          "completion_tokens": 66,
+          "prompt_tokens": 1856,
+          "total_tokens": 1922
        }
      }
    },
@ -89,7 +90,7 @@
          "type": "Input",
          "locate": {
            "prompt": "input box for adding a new todo",
-            "bbox": [513, 127, 1068, 194]
+            "bbox": [513, 128, 1067, 194]
          },
          "param": {
            "value": "hello"
@ -102,18 +103,18 @@
            "type": "Input",
            "locate": {
              "prompt": "input box for adding a new todo",
-              "bbox": [513, 127, 1068, 194]
+              "bbox": [513, 128, 1067, 194]
            },
            "param": {
              "value": "hello"
            }
          }
        ],
-        "rawResponse": "{\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        127,\n        1068,\n        194\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  },\n  \"log\": \"输入 'hello' 到输入框中。\",\n  \"more_actions_needed_by_instruction\": false\n}",
+        "rawResponse": "{\n  \"action\": {\n    \"type\": \"Input\",\n    \"locate\": {\n      \"bbox_2d\": [\n        513,\n        128,\n        1067,\n        194\n      ],\n      \"prompt\": \"input box for adding a new todo\"\n    },\n    \"param\": {\n      \"value\": \"hello\"\n    }\n  },\n  \"log\": \"输入 'hello' 到输入框中。\",\n  \"more_actions_needed_by_instruction\": false\n}",
        "usage": {
-          "prompt_tokens": 1774,
-          "completion_tokens": 107,
-          "total_tokens": 1881
+          "completion_tokens": 108,
+          "prompt_tokens": 1807,
+          "total_tokens": 1915
        }
      }
    },
@ -121,15 +122,13 @@
      "prompt": "click OK",
      "response_planning": {
        "action": null,
-        "log": "There is no 'OK' button visible on the current screen to click.",
-        "more_actions_needed_by_instruction": false,
-        "error": "The instruction cannot be completed as there is no 'OK' button present.",
+        "error": "There is no 'OK' button visible in the screenshot.",
        "actions": [],
-        "rawResponse": "{\n  \"action\": null,\n  \"log\": \"There is no 'OK' button visible on the current screen to click.\",\n  \"more_actions_needed_by_instruction\": false,\n  \"error\": \"The instruction cannot be completed as there is no 'OK' button present.\"\n}",
+        "rawResponse": "{\n  \"action\": null,\n  \"error\": \"There is no 'OK' button visible in the screenshot.\"\n}",
        "usage": {
-          "prompt_tokens": 1768,
-          "completion_tokens": 58,
-          "total_tokens": 1826
+          "completion_tokens": 26,
+          "prompt_tokens": 1801,
+          "total_tokens": 1827
        }
      }
    },
@ -142,9 +141,24 @@
        "actions": [],
        "rawResponse": "{\n  \"action\": null,\n  \"log\": \"There is no 'OK' button on the current screen.\",\n  \"more_actions_needed_by_instruction\": false\n}",
        "usage": {
-          "prompt_tokens": 1777,
-          "completion_tokens": 35,
-          "total_tokens": 1812
+          "completion_tokens": 36,
+          "prompt_tokens": 1810,
+          "total_tokens": 1846
+        }
+      }
+    },
+    {
+      "prompt": "if there is an 'OK' button, click it. If not, try again in next step",
+      "response_planning": {
+        "action": null,
+        "log": "There is no 'OK' button on the current screen.",
+        "more_actions_needed_by_instruction": true,
+        "actions": [],
+        "rawResponse": "{\n  \"action\": null,\n  \"log\": \"There is no 'OK' button on the current screen.\",\n  \"more_actions_needed_by_instruction\": true\n}",
+        "usage": {
+          "completion_tokens": 36,
+          "prompt_tokens": 1819,
+          "total_tokens": 1855
        }
      }
    }
--- a/packages/evaluation/src/test-analyzer.ts
+++ b/packages/evaluation/src/test-analyzer.ts
@ -167,6 +167,20 @@ ${errorMsg ? `Error: ${errorMsg}` : ''}
    result: ActualResult | Error,
  ): true | Error {
    const distanceThreshold = 16;
+
+    if (testCase.response_planning?.error) {
+      if (!(result instanceof Error)) {
+        const msg = `Expected error: ${testCase.response_planning.error}, but got ${JSON.stringify(result, null, 2)}, the prompt is: ${testCase.prompt}`;
+        return new Error(msg);
+      }
+      return true;
+    }
+
+    if (result instanceof Error) {
+      const msg = `got error: ${result}, but expected?.error is not set, the prompt is: ${testCase.prompt}`;
+      return new Error(msg);
+    }
+
    // compare coordinates
    if ('rawResponse' in result && result.rawResponse.bbox) {
      assert(testCase.response_bbox, 'testCase.response_bbox is required');
@ -241,19 +255,6 @@ ${errorMsg ? `Error: ${errorMsg}` : ''}
      return true;
    }

-    if (testCase.response_planning?.error) {
-      if (!(result instanceof Error)) {
-        const msg = `got error: ${result}, but expected?.error is not set, the prompt is: ${testCase.prompt}`;
-        return new Error(msg);
-      }
-      return true;
-    }
-
-    if (result instanceof Error) {
-      const msg = `got error: ${result}, but expected?.error is not set, the prompt is: ${testCase.prompt}`;
-      return new Error(msg);
-    }
-
    const msg = `unknown result type, can not compare, the prompt is: ${testCase.prompt}`;
    return new Error(msg);
  }
--- a/packages/evaluation/tests/llm-planning.test.ts
+++ b/packages/evaluation/tests/llm-planning.test.ts
@ -80,7 +80,7 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
 });

 const vlCases = ['todo-vl', 'aweme-login-vl', 'antd-form-vl'];
-// const vlCases = ['todo-vl'];
+// const vlCases = ['aweme-login-vl'];

 describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
  vlCases.forEach((source) => {
@ -111,7 +111,6 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
            });
          } catch (error) {
            res = error as Error;
-            throw error;
          }

          if (process.env.UPDATE_ANSWER_DATA) {
--- a/packages/web-integration/src/bridge-mode/agent-cli-side.ts
+++ b/packages/web-integration/src/bridge-mode/agent-cli-side.ts
@ -98,6 +98,8 @@ export const getBridgePageInCliSide = (): ChromeExtensionPageCliSide => {
 };

 export class AgentOverChromeBridge extends PageAgent<ChromeExtensionPageCliSide> {
+  private destroyAfterDisconnectFlag?: boolean;
+
  constructor(opts?: PageAgentOpt & { closeNewTabsAfterDisconnect?: boolean }) {
    const page = getBridgePageInCliSide();
    super(
@ -108,10 +110,13 @@ export class AgentOverChromeBridge extends PageAgent<ChromeExtensionPageCliSide>
        },
      }),
    );
+    this.destroyAfterDisconnectFlag = opts?.closeNewTabsAfterDisconnect;
+  }

-    if (typeof opts?.closeNewTabsAfterDisconnect === 'boolean') {
+  async setDestroyOptionsAfterConnect() {
+    if (this.destroyAfterDisconnectFlag) {
      this.page.setDestroyOptions({
-        closeTab: opts.closeNewTabsAfterDisconnect,
+        closeTab: true,
      });
    }
  }
@ -119,11 +124,13 @@ export class AgentOverChromeBridge extends PageAgent<ChromeExtensionPageCliSide>
  async connectNewTabWithUrl(url: string, options?: BridgeConnectTabOptions) {
    await this.page.connectNewTabWithUrl(url, options);
    await sleep(500);
+    await this.setDestroyOptionsAfterConnect();
  }

  async connectCurrentTab(options?: BridgeConnectTabOptions) {
    await this.page.connectCurrentTab(options);
    await sleep(500);
+    await this.setDestroyOptionsAfterConnect();
  }

  async aiAction(prompt: string, options?: any) {
--- a/packages/web-integration/src/bridge-mode/page-browser-side.ts
+++ b/packages/web-integration/src/bridge-mode/page-browser-side.ts
@ -54,7 +54,7 @@ export class ExtensionBridgePageBrowserSide extends ChromeExtensionProxyPage {
          return this.onLogMessage(args[0] as string, 'status');
        }

-        const tabId = await this.getTabId();
+        const tabId = await this.getActiveTabId();
        if (!tabId || tabId === 0) {
          throw new Error('no tab is connected');
        }
@ -126,6 +126,8 @@ export class ExtensionBridgePageBrowserSide extends ChromeExtensionProxyPage {
    if (options?.forceSameTabNavigation) {
      this.forceSameTabNavigation = true;
    }
+
+    await this.setActiveTabId(tabId);
  }

  public async connectCurrentTab(
@ -143,6 +145,8 @@ export class ExtensionBridgePageBrowserSide extends ChromeExtensionProxyPage {
    if (options?.forceSameTabNavigation) {
      this.forceSameTabNavigation = true;
    }
+
+    await this.setActiveTabId(tabId);
  }

  public async setDestroyOptions(options: ChromePageDestroyOptions) {
--- a/packages/web-integration/src/chrome-extension/page.ts
+++ b/packages/web-integration/src/chrome-extension/page.ts
@ -46,7 +46,20 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
    this.forceSameTabNavigation = forceSameTabNavigation;
  }

-  public async getTabId() {
+  public async setActiveTabId(tabId: number) {
+    if (this.activeTabId) {
+      throw new Error(
+        `Active tab id is already set, which is ${this.activeTabId}, cannot set it to ${tabId}`,
+      );
+    }
+    this.activeTabId = tabId;
+  }
+
+  public async getActiveTabId() {
+    return this.activeTabId;
+  }
+
+  public async getTabIdOrConnectToCurrentTab() {
    if (this.activeTabId) {
      // always keep on the connected tab
      return this.activeTabId;
@ -78,7 +91,7 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
      }

      try {
-        const currentTabId = await this.getTabId();
+        const currentTabId = await this.getTabIdOrConnectToCurrentTab();

        if (this.tabIdOfDebuggerAttached === currentTabId) {
          // already attached
@ -322,7 +335,7 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
  }

  async url() {
-    const tabId = await this.getTabId();
+    const tabId = await this.getTabIdOrConnectToCurrentTab();
    const url = await chrome.tabs.get(tabId).then((tab) => tab.url);
    return url || '';
  }
--- a/packages/web-integration/src/common/task-cache.ts
+++ b/packages/web-integration/src/common/task-cache.ts
@ -2,7 +2,7 @@ import { existsSync, readFileSync } from 'node:fs';
 import { join } from 'node:path';
 import type { AIElementIdResponse, PlanningAIResponse } from '@midscene/core';
 import type { vlmPlanning } from '@midscene/core/ai-model';
-import { getAIConfig } from '@midscene/core/env';
+import { getAIConfig, getAIConfigInBoolean } from '@midscene/core/env';
 import {
  getLogDirByType,
  stringifyDumpData,
@ -251,7 +251,7 @@ export class TaskCache {
      return undefined;
    }
    const cacheFile = join(getLogDirByType('cache'), `${this.cacheId}.json`);
-    if (getAIConfig('MIDSCENE_CACHE') === 'true' && existsSync(cacheFile)) {
+    if (getAIConfigInBoolean('MIDSCENE_CACHE') && existsSync(cacheFile)) {
      try {
        const data = readFileSync(cacheFile, 'utf8');
        const jsonData = JSON.parse(data);