fix(llm): coords offset in vl locator (#545)

2025-12-28 07:30:02 +00:00 · 2025-04-08 17:56:15 +08:00 · 2025-04-08 17:56:15 +08:00 · 732f605144
commit 732f605144
parent 8582f86793
12 changed files with 269 additions and 166 deletions
--- a/packages/core/src/ai-model/common.ts
+++ b/packages/core/src/ai-model/common.ts
@@ -13,6 +13,7 @@ import {

 import { vlLocateMode } from '@/env';
 import type { PlanningLocateParam } from '@/types';
+import { getDebug } from '@midscene/shared/logger';

 export type AIArgs = [
  ChatCompletionSystemMessageParam,
@@ -43,6 +44,7 @@ export async function callAiFn<T>(
 }

 const defaultBboxSize = 20; // must be even number
+const debugInspectUtils = getDebug('ai:common');

 // transform the param of locate from qwen mode
 export function fillLocateParam(
@@ -110,7 +112,12 @@ export function adaptDoubaoBbox(
  }

  // treat the bbox as a center point
-  if (bbox.length === 6 || bbox.length === 2) {
+  if (
+    bbox.length === 6 ||
+    bbox.length === 2 ||
+    bbox.length === 3 ||
+    bbox.length === 7
+  ) {
    return [
      Math.max(0, Math.round((bbox[0] * width) / 1000) - defaultBboxSize / 2),
      Math.max(0, Math.round((bbox[1] * height) / 1000) - defaultBboxSize / 2),
@@ -161,6 +168,15 @@ export function adaptBboxToRect(
  offsetY = 0,
  errorMsg?: string,
 ): Rect {
+  debugInspectUtils(
+    'adaptBboxToRect',
+    bbox,
+    width,
+    height,
+    offsetX,
+    offsetY,
+    errorMsg || '',
+  );
  const [left, top, right, bottom] = adaptBbox(bbox, width, height, errorMsg);
  return {
    left: left + offsetX,
--- a/packages/core/src/ai-model/index.ts
+++ b/packages/core/src/ai-model/index.ts
@@ -9,7 +9,6 @@ export {
  AiExtractElementInfo,
  AiAssert,
  AiLocateSection,
-  transformElementPositionToId,
 } from './inspect';

 export { plan } from './llm-planning';
--- a/packages/core/src/ai-model/inspect.ts
+++ b/packages/core/src/ai-model/inspect.ts
@@ -2,6 +2,7 @@ import {
  MIDSCENE_USE_QWEN_VL,
  MIDSCENE_USE_VLM_UI_TARS,
  getAIConfigInBoolean,
+  vlLocateMode,
 } from '@/env';
 import type {
  AIAssertionResponse,
@@ -64,114 +65,6 @@ const liteContextConfig = {
 const debugInspect = getDebug('ai:inspect');
 const debugSection = getDebug('ai:section');

-function transformToAbsoluteCoords(
-  relativePosition: { x: number; y: number },
-  size: Size,
-) {
-  return {
-    x: Number(((relativePosition.x / 1000) * size.width).toFixed(3)),
-    y: Number(((relativePosition.y / 1000) * size.height).toFixed(3)),
-  };
-}
-
-// let index = 0;
-export async function transformElementPositionToId(
-  aiResult: AIElementResponse | [number, number],
-  treeRoot: ElementTreeNode<BaseElement>,
-  size: { width: number; height: number },
-  searchAreaRect: Rect | undefined,
-  insertElementByPosition: (position: { x: number; y: number }) => BaseElement,
-) {
-  const emptyResponse: AIElementResponse = {
-    errors: [],
-    elements: [],
-  };
-
-  const elementAtPosition = (center: { x: number; y: number }) => {
-    const element = elementByPositionWithElementInfo(treeRoot, center);
-    const distanceToCenter = element
-      ? distance({ x: element.center[0], y: element.center[1] }, center)
-      : 0;
-    return distanceToCenter <= distanceThreshold ? element : undefined;
-  };
-
-  if ('bbox' in aiResult) {
-    if (
-      !Array.isArray(aiResult.bbox) ||
-      (aiResult.bbox as number[]).length !== 4
-    ) {
-      return emptyResponse;
-    }
-
-    const bbox: [number, number, number, number] = [
-      aiResult.bbox[0] + (searchAreaRect?.left || 0),
-      aiResult.bbox[1] + (searchAreaRect?.top || 0),
-      aiResult.bbox[2] + (searchAreaRect?.left || 0),
-      aiResult.bbox[3] + (searchAreaRect?.top || 0),
-    ];
-    const centerX = Math.round((bbox[0] + bbox[2]) / 2);
-    const centerY = Math.round((bbox[1] + bbox[3]) / 2);
-
-    let element = elementAtPosition({ x: centerX, y: centerY });
-
-    if (!element) {
-      element = insertElementByPosition({
-        x: centerX,
-        y: centerY,
-      });
-    }
-    assert(
-      element,
-      `inspect: no element found with coordinates: ${JSON.stringify(bbox)}`,
-    );
-    return {
-      errors: [],
-      elements: [
-        {
-          id: element.id,
-        },
-      ],
-      bbox,
-    };
-  }
-
-  if (Array.isArray(aiResult)) {
-    // [number, number] coord
-    const relativePosition = aiResult;
-    const absolutePosition = transformToAbsoluteCoords(
-      {
-        x: relativePosition[0],
-        y: relativePosition[1],
-      },
-      size,
-    );
-
-    let element = elementAtPosition(absolutePosition);
-    if (!element) {
-      element = insertElementByPosition(absolutePosition);
-    }
-
-    assert(
-      element,
-      `inspect: no id found with position: ${JSON.stringify({ absolutePosition })}`,
-    );
-
-    return {
-      errors: [],
-      elements: [
-        {
-          id: element.id,
-        },
-      ],
-    };
-  }
-
-  return {
-    errors: aiResult.errors,
-    elements: aiResult.elements,
-  };
-}
-
 function matchQuickAnswer(
  quickAnswer:
    | Partial<AISingleElementResponse>
@@ -258,7 +151,7 @@ export async function AiLocateElement<
    pageDescription: description,
    targetElementDescription,
  });
-  const systemPrompt = systemPromptToLocateElement();
+  const systemPrompt = systemPromptToLocateElement(!!vlLocateMode());

  let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;

@@ -305,32 +198,51 @@ export async function AiLocateElement<
  const rawResponse = JSON.stringify(res.content);

  let resRect: Rect | undefined;
+  let matchedElements: AIElementLocatorResponse['elements'] =
+    'elements' in res.content ? res.content.elements : [];
+  let errors: AIElementLocatorResponse['errors'] | undefined =
+    'errors' in res.content ? res.content.errors : [];
  if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
    const errorMsg = res.content.errors?.length
      ? `Failed to parse bbox: ${res.content.errors?.join(',')}`
      : '';
+
    resRect = adaptBboxToRect(
      res.content.bbox,
-      context.size.width,
-      context.size.height,
+      options.searchConfig?.rect?.width || context.size.width,
+      options.searchConfig?.rect?.height || context.size.height,
      options.searchConfig?.rect?.left,
      options.searchConfig?.rect?.top,
      errorMsg,
    );
    debugInspect('resRect', resRect);
-  }

-  const parseResult = await transformElementPositionToId(
-    res.content,
-    context.tree,
-    size,
-    options.searchConfig?.rect,
-    insertElementByPosition,
-  );
+    const rectCenter = {
+      x: resRect.left + resRect.width / 2,
+      y: resRect.top + resRect.height / 2,
+    };
+    let element = elementByPositionWithElementInfo(context.tree, rectCenter);
+
+    const distanceToCenter = element
+      ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)
+      : 0;
+
+    if (!element || distanceToCenter > distanceThreshold) {
+      element = insertElementByPosition(rectCenter);
+    }
+
+    if (element) {
+      matchedElements = [element];
+      errors = [];
+    }
+  }

  return {
    rect: resRect,
-    parseResult,
+    parseResult: {
+      elements: matchedElements,
+      errors,
+    },
    rawResponse,
    elementById,
    usage: res.usage,
@@ -394,7 +306,7 @@ export async function AiLocateSection(options: {
    debugSection('referenceBboxList %j', referenceBboxList);

    const referenceRects = referenceBboxList
-      .filter((bbox) => Array.isArray(bbox) && bbox.length === 4)
+      .filter((bbox) => Array.isArray(bbox))
      .map((bbox) => {
        return adaptBboxToRect(bbox, context.size.width, context.size.height);
      });
--- a/packages/core/src/ai-model/prompt/llm-locator.ts
+++ b/packages/core/src/ai-model/prompt/llm-locator.ts
@@ -1,9 +1,8 @@
-import { vlLocateMode } from '@/env';
 import { PromptTemplate } from '@langchain/core/prompts';
 import type { ResponseFormatJSONSchema } from 'openai/resources';

-export function systemPromptToLocateElement() {
-  if (vlLocateMode()) {
+export function systemPromptToLocateElement(vlMode: boolean) {
+  if (vlMode) {
    return `
 ## Role:
 You are an expert in software testing.
@@ -15,7 +14,7 @@ You are an expert in software testing.
 ## Output Format:
 \`\`\`json
 {
-  "bbox": [number, number, number, number],  // top-left x, top-left y, bottom-right x, bottom-right y
+  "bbox": [number, number, number, number],  // left, top, right, bottom
  "errors"?: string[]
 }
 \`\`\`
--- a/packages/core/src/ai-model/prompt/llm-section-locator.ts
+++ b/packages/core/src/ai-model/prompt/llm-section-locator.ts
@@ -20,6 +20,8 @@ return in this JSON format:
  "error"?: string
 }
 \`\`\`
+
+In which, all the numbers in the \`bbox\` means the distance to the left, top, right, bottom of the page.
 `;
 }

--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -4,7 +4,6 @@ import { getLogDirByType, getVersion, setLogDir } from './utils';

 export {
  plan,
-  transformElementPositionToId,
  describeUserPage,
  AiLocateElement,
  AiAssert,
--- a/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
+++ b/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
@@ -1,5 +1,166 @@
 // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html

+exports[`system prompts > locator - 4o 1`] = `
+"
+## Role:
+You are an expert in software page image (2D) and page element text analysis.
+
+## Objective:
+- Identify elements in screenshots and text that match the user's description.
+- Return JSON data containing the selection reason and element ID.
+
+## Skills:
+- Image analysis and recognition
+- Multilingual text understanding
+- Software UI design and testing
+
+## Workflow:
+1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
+2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
+3. Found the required number of elements
+4. Return JSON data containing the selection reason and element ID.
+
+## Constraints:
+- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
+- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
+- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
+- If no elements are found, the "elements" array should be empty.
+- The returned data must conform to the specified JSON format.
+- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
+
+## Output Format:
+
+Please return the result in JSON format as follows:
+
+\`\`\`json
+{
+  "elements": [
+    // If no matching elements are found, return an empty array []
+    {
+      "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
+      "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
+      "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
+    }
+    // More elements...
+  ],
+  "errors": [] // Array of strings containing any error messages
+}
+\`\`\`
+
+## Example:
+Example 1:
+Input Example:
+\`\`\`json
+// Description: "Shopping cart icon in the upper right corner"
+{
+  "description": "PLACEHOLDER", // Description of the target element
+  "screenshot": "path/screenshot.png",
+  "text": '{
+      "pageSize": {
+        "width": 400, // Width of the page
+        "height": 905 // Height of the page
+      },
+      "elementInfos": [
+        {
+          "id": "1231", // ID of the element
+          "indexId": "0", // Index of the element，The image is labeled to the left of the element
+          "attributes": { // Attributes of the element
+            "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
+            "src": "https://ap-southeast-3.m",
+            "class": ".img"
+          },
+          "content": "", // Text content of the element
+          "rect": {
+            "left": 280, // Distance from the left side of the page
+            "top": 8, // Distance from the top of the page
+            "width": 44, // Width of the element
+            "height": 44 // Height of the element
+          }
+        },
+        {
+          "id": "66551", // ID of the element
+          "indexId": "1", // Index of the element,The image is labeled to the left of the element
+          "attributes": { // Attributes of the element
+            "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
+            "src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
+            "class": ".icon"
+          },
+          "content": "", // Text content of the element
+          "rect": {
+            "left": 350, // Distance from the left side of the page
+            "top": 16, // Distance from the top of the page
+            "width": 25, // Width of the element
+            "height": 25 // Height of the element
+          }
+        },
+        ...
+        {
+          "id": "12344",
+          "indexId": "2", // Index of the element，The image is labeled to the left of the element
+          "attributes": {
+            "nodeType": "TEXT Node",
+            "class": ".product-name"
+          },
+          "center": [
+            288,
+            834
+          ],
+          "content": "Mango Drink",
+          "rect": {
+            "left": 188,
+            "top": 827,
+            "width": 199,
+            "height": 13
+          }
+        },
+        ...
+      ]
+    }
+  '
+}
+\`\`\`
+Output Example:
+\`\`\`json
+{
+  "elements": [
+    {
+      // Describe the reason for finding this element, replace with actual value in practice
+      "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
+      "text": "",
+      // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
+      "id": "1231"
+    }
+  ],
+  "errors": []
+}
+\`\`\`
+  
+  "
+`;
+
+exports[`system prompts > locator - qwen 1`] = `
+"
+## Role:
+You are an expert in software testing.
+
+## Objective:
+- Identify elements in screenshots and text that match the user's description.
+- Give the coordinates of the element that matches the user's description best in the screenshot.
+
+## Output Format:
+\`\`\`json
+{
+  "bbox": [number, number, number, number],  // left, top, right, bottom
+  "errors"?: string[]
+}
+\`\`\`
+
+Fields:
+* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
+* \`errors\` is an optional array of error messages (if any)
+"
+`;
+
 exports[`system prompts > planning - 4o - response format 1`] = `
 {
  "json_schema": {
@@ -389,5 +550,7 @@ return in this JSON format:
  "error"?: string
 }
 \`\`\`
+
+In which, all the numbers in the \`bbox\` means the distance to the left, top, right, bottom of the page.
 "
 `;
--- a/packages/core/tests/unit-test/prompt/prompt.test.ts
+++ b/packages/core/tests/unit-test/prompt/prompt.test.ts
@@ -1,3 +1,4 @@
+import { systemPromptToLocateElement } from '@/ai-model';
 import {
  automationUserPrompt,
  generateTaskBackgroundContext,
@@ -62,4 +63,14 @@ describe('system prompts', () => {
    const prompt = systemPromptToLocateSection();
    expect(prompt).toMatchSnapshot();
  });
+
+  it('locator - 4o', () => {
+    const prompt = systemPromptToLocateElement(false);
+    expect(prompt).toMatchSnapshot();
+  });
+
+  it('locator - qwen', () => {
+    const prompt = systemPromptToLocateElement(true);
+    expect(prompt).toMatchSnapshot();
+  });
 });
--- a/packages/evaluation/page-cases/inspect/aweme-login.json
+++ b/packages/evaluation/page-cases/inspect/aweme-login.json
@@ -42,7 +42,8 @@
        "height": 49
      },
      "response_element": {
-        "id": "fcgao"
+        "id": "mfodf",
+        "indexId": 10
      }
    },
    {
@@ -56,7 +57,8 @@
        "height": 50
      },
      "response_element": {
-        "id": "jgnil"
+        "id": "nhbof",
+        "indexId": 12
      }
    },
    {
--- a/packages/evaluation/page-cases/inspect/taobao.json
+++ b/packages/evaluation/page-cases/inspect/taobao.json
@@ -6,13 +6,14 @@
      "multi": false,
      "annotation_index_id": 1,
      "response_rect": {
-        "left": 319,
-        "top": 54,
-        "width": 533,
-        "height": 36
+        "left": 329,
+        "top": 56,
+        "width": 457,
+        "height": 41
      },
      "response_element": {
-        "id": "hlefc"
+        "id": "jfjah",
+        "indexId": 27
      }
    },
    {
@@ -20,10 +21,10 @@
      "multi": false,
      "annotation_index_id": 2,
      "response_rect": {
-        "left": 789,
-        "top": 56,
-        "width": 45,
-        "height": 32
+        "left": 786,
+        "top": 58,
+        "width": 64,
+        "height": 39
      },
      "response_element": {
        "id": "ondpi",
@@ -32,30 +33,30 @@
    },
    {
      "prompt": "产品分类里面的：男鞋（文字）",
-      "searchArea": "产品分类里面的：男鞋（文字）",
+      "deepThink": true,
      "multi": false,
      "annotation_index_id": 3,
      "response_rect": {
-        "left": 109,
-        "top": 429,
-        "width": 24,
-        "height": 15
+        "left": 137,
+        "top": 416,
+        "width": 36,
+        "height": 20
      },
      "response_element": {
-        "id": "hgioh",
-        "indexId": 98
+        "id": "cjfcl",
+        "indexId": 99
      }
    },
    {
      "prompt": "右侧“立即登录”下方的收藏夹 icon",
-      "searchArea": "右侧“立即登录”下方的一排 icon",
+      "deepThink": true,
      "multi": false,
      "annotation_index_id": 4,
      "response_rect": {
-        "left": 1064,
-        "top": 383,
-        "width": 21,
-        "height": 22
+        "left": 1056,
+        "top": 386,
+        "width": 32,
+        "height": 28
      },
      "response_element": {
        "id": "fkfdl",
@@ -64,33 +65,32 @@
    },
    {
      "prompt": "最右侧五个悬浮按钮的第二个",
-      "searchArea": "最右侧有一列悬浮按钮",
+      "deepThink": true,
      "multi": false,
      "annotation_index_id": 5,
      "response_rect": {
        "left": 1253,
-        "top": 355,
-        "width": 22,
-        "height": 22
+        "top": 366,
+        "width": 26,
+        "height": 32
      },
      "response_element": {
-        "id": "iegkg",
-        "indexId": 212
+        "id": "aodmc"
      }
    },
    {
      "prompt": "购物车 icon",
-      "searchArea": "顶部工具栏",
+      "deepThink": true,
      "response_rect": {
-        "left": 837,
-        "top": 10,
-        "width": 15,
-        "height": 16
+        "left": 1010,
+        "top": 390,
+        "width": 32,
+        "height": 28
      },
      "annotation_index_id": 6,
      "response_element": {
-        "id": "aefln",
-        "indexId": 12
+        "id": "nkpom",
+        "indexId": 188
      }
    }
  ]
--- a/packages/evaluation/tests/llm-locator.test.ts
+++ b/packages/evaluation/tests/llm-locator.test.ts
@@ -69,7 +69,7 @@ testSources.forEach((source) => {

        const result = await insight.locate({
          prompt,
-          searchArea: testCase.searchArea,
+          deepThink: testCase.deepThink,
        });
        const { element, rect } = result;

--- a/packages/evaluation/tests/util.ts
+++ b/packages/evaluation/tests/util.ts
@@ -13,7 +13,7 @@ export const repeatTime = 1;

 export type TestCase = {
  prompt: string;
-  searchArea?: string;
+  deepThink?: boolean;
  log?: string;
  response_element?: { id: string; indexId?: number };
  response_rect?: Rect;