fix(llm): coords offset in vl locator (#545)

This commit is contained in:
yuyutaotao 2025-04-08 17:56:15 +08:00 committed by GitHub
parent 8582f86793
commit 732f605144
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 269 additions and 166 deletions

View File

@ -13,6 +13,7 @@ import {
import { vlLocateMode } from '@/env';
import type { PlanningLocateParam } from '@/types';
import { getDebug } from '@midscene/shared/logger';
export type AIArgs = [
ChatCompletionSystemMessageParam,
@ -43,6 +44,7 @@ export async function callAiFn<T>(
}
const defaultBboxSize = 20; // must be even number
const debugInspectUtils = getDebug('ai:common');
// transform the param of locate from qwen mode
export function fillLocateParam(
@ -110,7 +112,12 @@ export function adaptDoubaoBbox(
}
// treat the bbox as a center point
if (bbox.length === 6 || bbox.length === 2) {
if (
bbox.length === 6 ||
bbox.length === 2 ||
bbox.length === 3 ||
bbox.length === 7
) {
return [
Math.max(0, Math.round((bbox[0] * width) / 1000) - defaultBboxSize / 2),
Math.max(0, Math.round((bbox[1] * height) / 1000) - defaultBboxSize / 2),
@ -161,6 +168,15 @@ export function adaptBboxToRect(
offsetY = 0,
errorMsg?: string,
): Rect {
debugInspectUtils(
'adaptBboxToRect',
bbox,
width,
height,
offsetX,
offsetY,
errorMsg || '',
);
const [left, top, right, bottom] = adaptBbox(bbox, width, height, errorMsg);
return {
left: left + offsetX,

View File

@ -9,7 +9,6 @@ export {
AiExtractElementInfo,
AiAssert,
AiLocateSection,
transformElementPositionToId,
} from './inspect';
export { plan } from './llm-planning';

View File

@ -2,6 +2,7 @@ import {
MIDSCENE_USE_QWEN_VL,
MIDSCENE_USE_VLM_UI_TARS,
getAIConfigInBoolean,
vlLocateMode,
} from '@/env';
import type {
AIAssertionResponse,
@ -64,114 +65,6 @@ const liteContextConfig = {
const debugInspect = getDebug('ai:inspect');
const debugSection = getDebug('ai:section');
/**
 * Converts a position expressed in thousandths of the given size into
 * absolute coordinates.
 *
 * Each axis is divided by 1000, multiplied by the matching dimension of
 * `size`, and rounded to three decimal places.
 *
 * @param relativePosition - Point whose `x` / `y` are on a 0–1000 scale.
 * @param size - Dimensions used to scale the point (`width` for x, `height` for y).
 * @returns The scaled `{ x, y }` point.
 */
function transformToAbsoluteCoords(
  relativePosition: { x: number; y: number },
  size: Size,
) {
  // Scale one axis and trim floating-point noise to three decimals.
  const scaleAxis = (ratio: number, extent: number): number => {
    const scaled = (ratio / 1000) * extent;
    return Number(scaled.toFixed(3));
  };

  const x = scaleAxis(relativePosition.x, size.width);
  const y = scaleAxis(relativePosition.y, size.height);
  return { x, y };
}
// let index = 0;
/**
 * Normalizes an AI locate result into an `{ errors, elements }` response,
 * resolving coordinate-style results to an element id.
 *
 * Three input shapes are handled:
 * - an object carrying `bbox`: the bbox must be a 4-item array, otherwise an
 *   empty response is returned. The bbox is shifted by the `left` / `top` of
 *   `searchAreaRect` (when given) and its rounded center is resolved to an
 *   element; the shifted bbox is included in the result.
 * - a `[number, number]` pair: converted with `transformToAbsoluteCoords`
 *   against `size`, then resolved to an element.
 * - anything else: `errors` and `elements` are passed through unchanged.
 *
 * Resolving a point first looks the element up in `treeRoot`; a hit is kept
 * only when its center is within `distanceThreshold` of the point. Otherwise
 * `insertElementByPosition` is asked to supply an element for that point.
 *
 * @param aiResult - Raw locate result from the model.
 * @param treeRoot - Element tree searched for an element at the point.
 * @param size - Dimensions used to scale a `[x, y]` pair.
 * @param searchAreaRect - Optional rect whose `left` / `top` offset the bbox.
 * @param insertElementByPosition - Fallback producing an element for a point.
 * @returns The normalized response; an assertion fails when no element could
 *   be obtained for a coordinate-style result.
 */
export async function transformElementPositionToId(
  aiResult: AIElementResponse | [number, number],
  treeRoot: ElementTreeNode<BaseElement>,
  size: { width: number; height: number },
  searchAreaRect: Rect | undefined,
  insertElementByPosition: (position: { x: number; y: number }) => BaseElement,
) {
  // Element found in the tree at `point`, accepted only if its center is
  // no farther than distanceThreshold from that point.
  const findNearbyElement = (point: { x: number; y: number }) => {
    const candidate = elementByPositionWithElementInfo(treeRoot, point);
    const gap = candidate
      ? distance({ x: candidate.center[0], y: candidate.center[1] }, point)
      : 0;
    if (gap > distanceThreshold) {
      return undefined;
    }
    return candidate;
  };

  if ('bbox' in aiResult) {
    const rawBbox = aiResult.bbox;
    const isFourItemArray =
      Array.isArray(rawBbox) && (rawBbox as number[]).length === 4;
    if (!isFourItemArray) {
      const emptyResponse: AIElementResponse = {
        errors: [],
        elements: [],
      };
      return emptyResponse;
    }

    // Shift the bbox by the search area's origin (no shift when absent).
    const shiftX = searchAreaRect?.left || 0;
    const shiftY = searchAreaRect?.top || 0;
    const bbox: [number, number, number, number] = [
      aiResult.bbox[0] + shiftX,
      aiResult.bbox[1] + shiftY,
      aiResult.bbox[2] + shiftX,
      aiResult.bbox[3] + shiftY,
    ];

    const midX = Math.round((bbox[0] + bbox[2]) / 2);
    const midY = Math.round((bbox[1] + bbox[3]) / 2);

    const located =
      findNearbyElement({ x: midX, y: midY }) ||
      insertElementByPosition({ x: midX, y: midY });
    assert(
      located,
      `inspect: no element found with coordinates: ${JSON.stringify(bbox)}`,
    );

    return {
      errors: [],
      elements: [{ id: located.id }],
      bbox,
    };
  }

  if (Array.isArray(aiResult)) {
    // [number, number] coord
    const point = transformToAbsoluteCoords(
      { x: aiResult[0], y: aiResult[1] },
      size,
    );

    const located = findNearbyElement(point) || insertElementByPosition(point);
    assert(
      located,
      `inspect: no id found with position: ${JSON.stringify({ absolutePosition: point })}`,
    );

    return {
      errors: [],
      elements: [{ id: located.id }],
    };
  }

  // Already an id-based response: hand its fields back as-is.
  const { errors, elements } = aiResult;
  return { errors, elements };
}
function matchQuickAnswer(
quickAnswer:
| Partial<AISingleElementResponse>
@ -258,7 +151,7 @@ export async function AiLocateElement<
pageDescription: description,
targetElementDescription,
});
const systemPrompt = systemPromptToLocateElement();
const systemPrompt = systemPromptToLocateElement(!!vlLocateMode());
let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
@ -305,32 +198,51 @@ export async function AiLocateElement<
const rawResponse = JSON.stringify(res.content);
let resRect: Rect | undefined;
let matchedElements: AIElementLocatorResponse['elements'] =
'elements' in res.content ? res.content.elements : [];
let errors: AIElementLocatorResponse['errors'] | undefined =
'errors' in res.content ? res.content.errors : [];
if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
const errorMsg = res.content.errors?.length
? `Failed to parse bbox: ${res.content.errors?.join(',')}`
: '';
resRect = adaptBboxToRect(
res.content.bbox,
context.size.width,
context.size.height,
options.searchConfig?.rect?.width || context.size.width,
options.searchConfig?.rect?.height || context.size.height,
options.searchConfig?.rect?.left,
options.searchConfig?.rect?.top,
errorMsg,
);
debugInspect('resRect', resRect);
}
const parseResult = await transformElementPositionToId(
res.content,
context.tree,
size,
options.searchConfig?.rect,
insertElementByPosition,
);
const rectCenter = {
x: resRect.left + resRect.width / 2,
y: resRect.top + resRect.height / 2,
};
let element = elementByPositionWithElementInfo(context.tree, rectCenter);
const distanceToCenter = element
? distance({ x: element.center[0], y: element.center[1] }, rectCenter)
: 0;
if (!element || distanceToCenter > distanceThreshold) {
element = insertElementByPosition(rectCenter);
}
if (element) {
matchedElements = [element];
errors = [];
}
}
return {
rect: resRect,
parseResult,
parseResult: {
elements: matchedElements,
errors,
},
rawResponse,
elementById,
usage: res.usage,
@ -394,7 +306,7 @@ export async function AiLocateSection(options: {
debugSection('referenceBboxList %j', referenceBboxList);
const referenceRects = referenceBboxList
.filter((bbox) => Array.isArray(bbox) && bbox.length === 4)
.filter((bbox) => Array.isArray(bbox))
.map((bbox) => {
return adaptBboxToRect(bbox, context.size.width, context.size.height);
});

View File

@ -1,9 +1,8 @@
import { vlLocateMode } from '@/env';
import { PromptTemplate } from '@langchain/core/prompts';
import type { ResponseFormatJSONSchema } from 'openai/resources';
export function systemPromptToLocateElement() {
if (vlLocateMode()) {
export function systemPromptToLocateElement(vlMode: boolean) {
if (vlMode) {
return `
## Role:
You are an expert in software testing.
@ -15,7 +14,7 @@ You are an expert in software testing.
## Output Format:
\`\`\`json
{
"bbox": [number, number, number, number], // top-left x, top-left y, bottom-right x, bottom-right y
"bbox": [number, number, number, number], // left, top, right, bottom
"errors"?: string[]
}
\`\`\`

View File

@ -20,6 +20,8 @@ return in this JSON format:
"error"?: string
}
\`\`\`
In which, all the numbers in the \`bbox\` means the distance to the left, top, right, bottom of the page.
`;
}

View File

@ -4,7 +4,6 @@ import { getLogDirByType, getVersion, setLogDir } from './utils';
export {
plan,
transformElementPositionToId,
describeUserPage,
AiLocateElement,
AiAssert,

View File

@ -1,5 +1,166 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`system prompts > locator - 4o 1`] = `
"
## Role:
You are an expert in software page image (2D) and page element text analysis.
## Objective:
- Identify elements in screenshots and text that match the user's description.
- Return JSON data containing the selection reason and element ID.
## Skills:
- Image analysis and recognition
- Multilingual text understanding
- Software UI design and testing
## Workflow:
1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
3. Found the required number of elements
4. Return JSON data containing the selection reason and element ID.
## Constraints:
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
- If no elements are found, the "elements" array should be empty.
- The returned data must conform to the specified JSON format.
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
## Output Format:
Please return the result in JSON format as follows:
\`\`\`json
{
"elements": [
// If no matching elements are found, return an empty array []
{
"reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
"id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
}
// More elements...
],
"errors": [] // Array of strings containing any error messages
}
\`\`\`
## Example:
Example 1:
Input Example:
\`\`\`json
// Description: "Shopping cart icon in the upper right corner"
{
"description": "PLACEHOLDER", // Description of the target element
"screenshot": "path/screenshot.png",
"text": '{
"pageSize": {
"width": 400, // Width of the page
"height": 905 // Height of the page
},
"elementInfos": [
{
"id": "1231", // ID of the element
"indexId": "0", // Index of the elementThe image is labeled to the left of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "https://ap-southeast-3.m",
"class": ".img"
},
"content": "", // Text content of the element
"rect": {
"left": 280, // Distance from the left side of the page
"top": 8, // Distance from the top of the page
"width": 44, // Width of the element
"height": 44 // Height of the element
}
},
{
"id": "66551", // ID of the element
"indexId": "1", // Index of the element,The image is labeled to the left of the element
"attributes": { // Attributes of the element
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
"class": ".icon"
},
"content": "", // Text content of the element
"rect": {
"left": 350, // Distance from the left side of the page
"top": 16, // Distance from the top of the page
"width": 25, // Width of the element
"height": 25 // Height of the element
}
},
...
{
"id": "12344",
"indexId": "2", // Index of the elementThe image is labeled to the left of the element
"attributes": {
"nodeType": "TEXT Node",
"class": ".product-name"
},
"center": [
288,
834
],
"content": "Mango Drink",
"rect": {
"left": 188,
"top": 827,
"width": 199,
"height": 13
}
},
...
]
}
'
}
\`\`\`
Output Example:
\`\`\`json
{
"elements": [
{
// Describe the reason for finding this element, replace with actual value in practice
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
"text": "",
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
"id": "1231"
}
],
"errors": []
}
\`\`\`
"
`;
exports[`system prompts > locator - qwen 1`] = `
"
## Role:
You are an expert in software testing.
## Objective:
- Identify elements in screenshots and text that match the user's description.
- Give the coordinates of the element that matches the user's description best in the screenshot.
## Output Format:
\`\`\`json
{
"bbox": [number, number, number, number], // left, top, right, bottom
"errors"?: string[]
}
\`\`\`
Fields:
* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
* \`errors\` is an optional array of error messages (if any)
"
`;
exports[`system prompts > planning - 4o - response format 1`] = `
{
"json_schema": {
@ -389,5 +550,7 @@ return in this JSON format:
"error"?: string
}
\`\`\`
In which, all the numbers in the \`bbox\` means the distance to the left, top, right, bottom of the page.
"
`;

View File

@ -1,3 +1,4 @@
import { systemPromptToLocateElement } from '@/ai-model';
import {
automationUserPrompt,
generateTaskBackgroundContext,
@ -62,4 +63,14 @@ describe('system prompts', () => {
const prompt = systemPromptToLocateSection();
expect(prompt).toMatchSnapshot();
});
// Snapshot the locator system prompt generated with vlMode = false
// (this test labels that variant "4o").
it('locator - 4o', () => {
const prompt = systemPromptToLocateElement(false);
expect(prompt).toMatchSnapshot();
});
// Snapshot the locator system prompt generated with vlMode = true
// (this test labels that variant "qwen").
it('locator - qwen', () => {
const prompt = systemPromptToLocateElement(true);
expect(prompt).toMatchSnapshot();
});
});

View File

@ -42,7 +42,8 @@
"height": 49
},
"response_element": {
"id": "fcgao"
"id": "mfodf",
"indexId": 10
}
},
{
@ -56,7 +57,8 @@
"height": 50
},
"response_element": {
"id": "jgnil"
"id": "nhbof",
"indexId": 12
}
},
{

View File

@ -6,13 +6,14 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 319,
"top": 54,
"width": 533,
"height": 36
"left": 329,
"top": 56,
"width": 457,
"height": 41
},
"response_element": {
"id": "hlefc"
"id": "jfjah",
"indexId": 27
}
},
{
@ -20,10 +21,10 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 789,
"top": 56,
"width": 45,
"height": 32
"left": 786,
"top": 58,
"width": 64,
"height": 39
},
"response_element": {
"id": "ondpi",
@ -32,30 +33,30 @@
},
{
"prompt": "产品分类里面的:男鞋(文字)",
"searchArea": "产品分类里面的:男鞋(文字)",
"deepThink": true,
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 109,
"top": 429,
"width": 24,
"height": 15
"left": 137,
"top": 416,
"width": 36,
"height": 20
},
"response_element": {
"id": "hgioh",
"indexId": 98
"id": "cjfcl",
"indexId": 99
}
},
{
"prompt": "右侧“立即登录”下方的收藏夹 icon",
"searchArea": "右侧“立即登录”下方的一排 icon",
"deepThink": true,
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 1064,
"top": 383,
"width": 21,
"height": 22
"left": 1056,
"top": 386,
"width": 32,
"height": 28
},
"response_element": {
"id": "fkfdl",
@ -64,33 +65,32 @@
},
{
"prompt": "最右侧五个悬浮按钮的第二个",
"searchArea": "最右侧有一列悬浮按钮",
"deepThink": true,
"multi": false,
"annotation_index_id": 5,
"response_rect": {
"left": 1253,
"top": 355,
"width": 22,
"height": 22
"top": 366,
"width": 26,
"height": 32
},
"response_element": {
"id": "iegkg",
"indexId": 212
"id": "aodmc"
}
},
{
"prompt": "购物车 icon",
"searchArea": "顶部工具栏",
"deepThink": true,
"response_rect": {
"left": 837,
"top": 10,
"width": 15,
"height": 16
"left": 1010,
"top": 390,
"width": 32,
"height": 28
},
"annotation_index_id": 6,
"response_element": {
"id": "aefln",
"indexId": 12
"id": "nkpom",
"indexId": 188
}
}
]

View File

@ -69,7 +69,7 @@ testSources.forEach((source) => {
const result = await insight.locate({
prompt,
searchArea: testCase.searchArea,
deepThink: testCase.deepThink,
});
const { element, rect } = result;

View File

@ -13,7 +13,7 @@ export const repeatTime = 1;
export type TestCase = {
prompt: string;
searchArea?: string;
deepThink?: boolean;
log?: string;
response_element?: { id: string; indexId?: number };
response_rect?: Rect;