chore(core): update bbox preprocessing of ui-tars (#716)

This commit is contained in:
yuyutaotao 2025-05-16 13:22:57 +08:00 committed by GitHub
parent b9ff80a0db
commit fba2c00aec
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 93 additions and 19 deletions

View File

@ -97,7 +97,7 @@ export function adaptQwenBbox(
}
export function adaptDoubaoBbox(
bbox: number[] | string,
bbox: string[] | number[] | string,
width: number,
height: number,
): [number, number, number, number] {
@ -127,42 +127,65 @@ export function adaptDoubaoBbox(
bbox = bbox[0];
}
if (bbox.length === 4 || bbox.length === 5) {
let bboxList: number[] = [];
if (Array.isArray(bbox) && typeof bbox[0] === 'string') {
bbox.forEach((item) => {
if (typeof item === 'string' && item.includes(',')) {
const [x, y] = item.split(',');
bboxList.push(Number(x.trim()), Number(y.trim()));
} else if (typeof item === 'string' && item.includes(' ')) {
const [x, y] = item.split(' ');
bboxList.push(Number(x.trim()), Number(y.trim()));
} else {
bboxList.push(Number(item));
}
});
} else {
bboxList = bbox as any;
}
if (bboxList.length === 4 || bboxList.length === 5) {
return [
Math.round((bbox[0] * width) / 1000),
Math.round((bbox[1] * height) / 1000),
Math.round((bbox[2] * width) / 1000),
Math.round((bbox[3] * height) / 1000),
Math.round((bboxList[0] * width) / 1000),
Math.round((bboxList[1] * height) / 1000),
Math.round((bboxList[2] * width) / 1000),
Math.round((bboxList[3] * height) / 1000),
];
}
// treat the bbox as a center point
if (
bbox.length === 6 ||
bbox.length === 2 ||
bbox.length === 3 ||
bbox.length === 7
bboxList.length === 6 ||
bboxList.length === 2 ||
bboxList.length === 3 ||
bboxList.length === 7
) {
return [
Math.max(0, Math.round((bbox[0] * width) / 1000) - defaultBboxSize / 2),
Math.max(0, Math.round((bbox[1] * height) / 1000) - defaultBboxSize / 2),
Math.max(
0,
Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,
),
Math.max(
0,
Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,
),
Math.min(
width,
Math.round((bbox[0] * width) / 1000) + defaultBboxSize / 2,
Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,
),
Math.min(
height,
Math.round((bbox[1] * height) / 1000) + defaultBboxSize / 2,
Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,
),
];
}
if (bbox.length === 8) {
return [
Math.round((bbox[0] * width) / 1000),
Math.round((bbox[1] * height) / 1000),
Math.round((bbox[4] * width) / 1000),
Math.round((bbox[5] * height) / 1000),
Math.round((bboxList[0] * width) / 1000),
Math.round((bboxList[1] * height) / 1000),
Math.round((bboxList[4] * width) / 1000),
Math.round((bboxList[5] * height) / 1000),
];
}

View File

@ -31,6 +31,7 @@ import {
getAIConfig,
getAIConfigInBoolean,
getAIConfigInJson,
uiTarsModelVersion,
vlLocateMode,
} from '@midscene/shared/env';
import { enableDebug, getDebug } from '@midscene/shared/logger';
@ -285,7 +286,7 @@ export async function call(
}
debugProfileStats(
`model, ${model}, mode, ${vlLocateMode() || 'default'}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${Date.now() - startTime}, requestId, ${result._request_id || ''}`,
`model, ${model}, mode, ${vlLocateMode() || 'default'}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${Date.now() - startTime}, requestId, ${result._request_id || ''}`,
);
debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`);

View File

@ -385,6 +385,30 @@ describe('doubao-vision', () => {
]
`);
});
// Space-separated case: each array entry is an "x y" string pair, so two
// entries expand to four coordinates. With width=1000 and height=2000 the x
// values are unchanged and the y values are doubled (value * size / 1000).
it('adaptDoubaoBbox with string bbox', () => {
const result = adaptDoubaoBbox(['123 222', '789 100'], 1000, 2000);
expect(result).toMatchInlineSnapshot(`
[
123,
444,
789,
200,
]
`);
});
// Comma-separated case: each array entry is an "x,y" string pair; whitespace
// around either number (e.g. '789, 100') is trimmed before conversion.
// The title is distinct from the space-separated case so a failure report
// identifies which input format broke.
it('adaptDoubaoBbox with comma-separated string bbox', () => {
const result = adaptDoubaoBbox(['123,222', '789, 100'], 1000, 2000);
expect(result).toMatchInlineSnapshot(`
[
123,
444,
789,
200,
]
`);
});
});
describe('doubao-vision', () => {
@ -415,6 +439,32 @@ describe('doubao-vision', () => {
const input6 = '123 345 11111';
const result6 = preprocessDoubaoBboxJson(input6);
expect(result6).toMatchInlineSnapshot(`"123 345 11111"`);
const input7 = `
{
"bbox": [
"550 216",
"550 216",
"550 216",
"550 216"
],
"errors": []
}
`;
const result7 = preprocessDoubaoBboxJson(input7);
expect(result7).toMatchInlineSnapshot(`
"
{
"bbox": [
"550,216",
"550,216",
"550,216",
"550,216"
],
"errors": []
}
"
`);
});
it('adaptDoubaoBbox with 2 points', () => {