fix(core): keep context in log file when locate fails (#597)

This commit is contained in:
yuyutaotao 2025-04-21 13:56:30 +08:00 committed by GitHub
parent 10efa40c8b
commit 138864e6b1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 66 additions and 68 deletions

View File

@ -51,7 +51,6 @@ export function fillLocateParam(
locate: PlanningLocateParam,
width: number,
height: number,
errorMsg?: string,
) {
// The Qwen model sometimes hallucinates the field name and returns the bbox as `bbox_2d`.
if ((locate as any).bbox_2d && !locate?.bbox) {
@ -61,7 +60,7 @@ export function fillLocateParam(
}
if (locate?.bbox) {
locate.bbox = adaptBbox(locate.bbox, width, height, errorMsg);
locate.bbox = adaptBbox(locate.bbox, width, height);
}
return locate;
@ -69,12 +68,9 @@ export function fillLocateParam(
export function adaptQwenBbox(
bbox: number[],
errorMsg?: string,
): [number, number, number, number] {
if (bbox.length < 2) {
const msg =
errorMsg ||
`invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;
const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;
throw new Error(msg);
}
@ -95,7 +91,6 @@ export function adaptDoubaoBbox(
bbox: number[] | string,
width: number,
height: number,
errorMsg?: string,
): [number, number, number, number] {
assert(
width > 0 && height > 0,
@ -158,9 +153,7 @@ export function adaptDoubaoBbox(
];
}
const msg =
errorMsg ||
`invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
throw new Error(msg);
}
@ -168,13 +161,12 @@ export function adaptBbox(
bbox: number[],
width: number,
height: number,
errorMsg?: string,
): [number, number, number, number] {
if (vlLocateMode() === 'doubao-vision' || vlLocateMode() === 'vlm-ui-tars') {
return adaptDoubaoBbox(bbox, width, height, errorMsg);
return adaptDoubaoBbox(bbox, width, height);
}
return adaptQwenBbox(bbox, errorMsg);
return adaptQwenBbox(bbox);
}
export function adaptBboxToRect(
@ -183,18 +175,9 @@ export function adaptBboxToRect(
height: number,
offsetX = 0,
offsetY = 0,
errorMsg?: string,
): Rect {
debugInspectUtils(
'adaptBboxToRect',
bbox,
width,
height,
offsetX,
offsetY,
errorMsg || '',
);
const [left, top, right, bottom] = adaptBbox(bbox, width, height, errorMsg);
debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY);
const [left, top, right, bottom] = adaptBbox(bbox, width, height);
return {
left: left + offsetX,
top: top + offsetY,

View File

@ -202,38 +202,45 @@ export async function AiLocateElement<
'elements' in res.content ? res.content.elements : [];
let errors: AIElementLocatorResponse['errors'] | undefined =
'errors' in res.content ? res.content.errors : [];
if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
const errorMsg = res.content.errors?.length
? `Failed to parse bbox: ${res.content.errors?.join(',')}`
: '';
try {
if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
resRect = adaptBboxToRect(
res.content.bbox,
options.searchConfig?.rect?.width || context.size.width,
options.searchConfig?.rect?.height || context.size.height,
options.searchConfig?.rect?.left,
options.searchConfig?.rect?.top,
);
debugInspect('resRect', resRect);
resRect = adaptBboxToRect(
res.content.bbox,
options.searchConfig?.rect?.width || context.size.width,
options.searchConfig?.rect?.height || context.size.height,
options.searchConfig?.rect?.left,
options.searchConfig?.rect?.top,
errorMsg,
);
debugInspect('resRect', resRect);
const rectCenter = {
x: resRect.left + resRect.width / 2,
y: resRect.top + resRect.height / 2,
};
let element = elementByPositionWithElementInfo(context.tree, rectCenter);
const rectCenter = {
x: resRect.left + resRect.width / 2,
y: resRect.top + resRect.height / 2,
};
let element = elementByPositionWithElementInfo(context.tree, rectCenter);
const distanceToCenter = element
? distance({ x: element.center[0], y: element.center[1] }, rectCenter)
: 0;
const distanceToCenter = element
? distance({ x: element.center[0], y: element.center[1] }, rectCenter)
: 0;
if (!element || distanceToCenter > distanceThreshold) {
element = insertElementByPosition(rectCenter);
}
if (!element || distanceToCenter > distanceThreshold) {
element = insertElementByPosition(rectCenter);
if (element) {
matchedElements = [element];
errors = [];
}
}
if (element) {
matchedElements = [element];
errors = [];
} catch (e) {
const msg =
e instanceof Error
? `Failed to parse bbox: ${e.message}`
: 'unknown error in locate';
if (!errors || errors?.length === 0) {
errors = [msg];
} else {
errors.push(`(${msg})`);
}
}

View File

@ -86,12 +86,22 @@ export async function plan(
if (vlLocateMode()) {
actions.forEach((action) => {
if (action.locate) {
action.locate = fillLocateParam(
action.locate,
size.width,
size.height,
planFromAI.error,
);
try {
action.locate = fillLocateParam(
action.locate,
size.width,
size.height,
);
} catch (e) {
throw new Error(
`Failed to fill locate param: ${planFromAI.error} (${
e instanceof Error ? e.message : 'unknown error'
})`,
{
cause: e,
},
);
}
}
});
// In Qwen-VL, an error is a genuine error. In GPT-4o, an error may mean that more actions are needed.

View File

@ -138,7 +138,7 @@ export default class Insight<
quickAnswer: opt?.quickAnswer,
searchConfig: searchAreaResponse,
});
// const parseResult = await this.aiVendorFn<AIElementParseResponse>(msgs);
const timeCost = Date.now() - startTime;
const taskInfo: InsightTaskInfo = {
...(this.taskInfo ? this.taskInfo : {}),
@ -153,7 +153,7 @@ export default class Insight<
let errorLog: string | undefined;
if (parseResult.errors?.length) {
errorLog = `locate - AI response error: \n${parseResult.errors.join('\n')}`;
errorLog = `AI model failed to locate: \n${parseResult.errors.join('\n')}`;
}
const dumpData: PartialInsightDumpFromSDK = {

View File

@ -160,6 +160,12 @@ export class PageTaskExecutor {
const dumpCollector: DumpSubscriber = (dump) => {
insightDump = dump;
usage = dump?.taskInfo?.usage;
task.log = {
dump: insightDump,
};
task.usage = usage;
};
this.insight.onceDumpUpdatedFn = dumpCollector;
const shotTime = Date.now();
@ -170,6 +176,7 @@ export class PageTaskExecutor {
screenshot: pageContext.screenshotBase64,
timing: 'before locate',
};
task.recorder = [recordItem];
const cachePrompt = param.prompt;
const locateCache = cacheGroup?.matchCache(
@ -218,9 +225,6 @@ export class PageTaskExecutor {
});
}
if (!element) {
task.log = {
dump: insightDump,
};
throw new Error(`Element not found: ${param.prompt}`);
}
@ -229,15 +233,10 @@ export class PageTaskExecutor {
element,
},
pageContext,
log: {
dump: insightDump,
},
cache: {
hit: cacheHitFlag,
},
recorder: [recordItem],
aiCost,
usage,
};
},
};
@ -826,7 +825,6 @@ export class PageTaskExecutor {
logList.push(planResult.log);
}
// console.log('planningResult is', planResult);
if (!planResult.more_actions_needed_by_instruction) {
planningTask = null;
break;