mirror of
https://github.com/web-infra-dev/midscene.git
synced 2026-01-08 13:10:30 +00:00
chore(core): update bbox preprocessing of ui-tars (#716)
This commit is contained in:
parent
b9ff80a0db
commit
fba2c00aec
@ -97,7 +97,7 @@ export function adaptQwenBbox(
|
||||
}
|
||||
|
||||
export function adaptDoubaoBbox(
|
||||
bbox: number[] | string,
|
||||
bbox: string[] | number[] | string,
|
||||
width: number,
|
||||
height: number,
|
||||
): [number, number, number, number] {
|
||||
@ -127,42 +127,65 @@ export function adaptDoubaoBbox(
|
||||
bbox = bbox[0];
|
||||
}
|
||||
|
||||
if (bbox.length === 4 || bbox.length === 5) {
|
||||
let bboxList: number[] = [];
|
||||
if (Array.isArray(bbox) && typeof bbox[0] === 'string') {
|
||||
bbox.forEach((item) => {
|
||||
if (typeof item === 'string' && item.includes(',')) {
|
||||
const [x, y] = item.split(',');
|
||||
bboxList.push(Number(x.trim()), Number(y.trim()));
|
||||
} else if (typeof item === 'string' && item.includes(' ')) {
|
||||
const [x, y] = item.split(' ');
|
||||
bboxList.push(Number(x.trim()), Number(y.trim()));
|
||||
} else {
|
||||
bboxList.push(Number(item));
|
||||
}
|
||||
});
|
||||
} else {
|
||||
bboxList = bbox as any;
|
||||
}
|
||||
|
||||
if (bboxList.length === 4 || bboxList.length === 5) {
|
||||
return [
|
||||
Math.round((bbox[0] * width) / 1000),
|
||||
Math.round((bbox[1] * height) / 1000),
|
||||
Math.round((bbox[2] * width) / 1000),
|
||||
Math.round((bbox[3] * height) / 1000),
|
||||
Math.round((bboxList[0] * width) / 1000),
|
||||
Math.round((bboxList[1] * height) / 1000),
|
||||
Math.round((bboxList[2] * width) / 1000),
|
||||
Math.round((bboxList[3] * height) / 1000),
|
||||
];
|
||||
}
|
||||
|
||||
// treat the bbox as a center point
|
||||
if (
|
||||
bbox.length === 6 ||
|
||||
bbox.length === 2 ||
|
||||
bbox.length === 3 ||
|
||||
bbox.length === 7
|
||||
bboxList.length === 6 ||
|
||||
bboxList.length === 2 ||
|
||||
bboxList.length === 3 ||
|
||||
bboxList.length === 7
|
||||
) {
|
||||
return [
|
||||
Math.max(0, Math.round((bbox[0] * width) / 1000) - defaultBboxSize / 2),
|
||||
Math.max(0, Math.round((bbox[1] * height) / 1000) - defaultBboxSize / 2),
|
||||
Math.max(
|
||||
0,
|
||||
Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,
|
||||
),
|
||||
Math.max(
|
||||
0,
|
||||
Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,
|
||||
),
|
||||
Math.min(
|
||||
width,
|
||||
Math.round((bbox[0] * width) / 1000) + defaultBboxSize / 2,
|
||||
Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,
|
||||
),
|
||||
Math.min(
|
||||
height,
|
||||
Math.round((bbox[1] * height) / 1000) + defaultBboxSize / 2,
|
||||
Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
if (bbox.length === 8) {
|
||||
return [
|
||||
Math.round((bbox[0] * width) / 1000),
|
||||
Math.round((bbox[1] * height) / 1000),
|
||||
Math.round((bbox[4] * width) / 1000),
|
||||
Math.round((bbox[5] * height) / 1000),
|
||||
Math.round((bboxList[0] * width) / 1000),
|
||||
Math.round((bboxList[1] * height) / 1000),
|
||||
Math.round((bboxList[4] * width) / 1000),
|
||||
Math.round((bboxList[5] * height) / 1000),
|
||||
];
|
||||
}
|
||||
|
||||
|
||||
@ -31,6 +31,7 @@ import {
|
||||
getAIConfig,
|
||||
getAIConfigInBoolean,
|
||||
getAIConfigInJson,
|
||||
uiTarsModelVersion,
|
||||
vlLocateMode,
|
||||
} from '@midscene/shared/env';
|
||||
import { enableDebug, getDebug } from '@midscene/shared/logger';
|
||||
@ -285,7 +286,7 @@ export async function call(
|
||||
}
|
||||
|
||||
debugProfileStats(
|
||||
`model, ${model}, mode, ${vlLocateMode() || 'default'}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${Date.now() - startTime}, requestId, ${result._request_id || ''}`,
|
||||
`model, ${model}, mode, ${vlLocateMode() || 'default'}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${Date.now() - startTime}, requestId, ${result._request_id || ''}`,
|
||||
);
|
||||
|
||||
debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`);
|
||||
|
||||
@ -385,6 +385,30 @@ describe('doubao-vision', () => {
|
||||
]
|
||||
`);
|
||||
});
|
||||
|
||||
it('adaptDoubaoBbox with string bbox', () => {
|
||||
const result = adaptDoubaoBbox(['123 222', '789 100'], 1000, 2000);
|
||||
expect(result).toMatchInlineSnapshot(`
|
||||
[
|
||||
123,
|
||||
444,
|
||||
789,
|
||||
200,
|
||||
]
|
||||
`);
|
||||
});
|
||||
|
||||
it('adaptDoubaoBbox with string bbox', () => {
|
||||
const result = adaptDoubaoBbox(['123,222', '789, 100'], 1000, 2000);
|
||||
expect(result).toMatchInlineSnapshot(`
|
||||
[
|
||||
123,
|
||||
444,
|
||||
789,
|
||||
200,
|
||||
]
|
||||
`);
|
||||
});
|
||||
});
|
||||
|
||||
describe('doubao-vision', () => {
|
||||
@ -415,6 +439,32 @@ describe('doubao-vision', () => {
|
||||
const input6 = '123 345 11111';
|
||||
const result6 = preprocessDoubaoBboxJson(input6);
|
||||
expect(result6).toMatchInlineSnapshot(`"123 345 11111"`);
|
||||
|
||||
const input7 = `
|
||||
{
|
||||
"bbox": [
|
||||
"550 216",
|
||||
"550 216",
|
||||
"550 216",
|
||||
"550 216"
|
||||
],
|
||||
"errors": []
|
||||
}
|
||||
`;
|
||||
const result7 = preprocessDoubaoBboxJson(input7);
|
||||
expect(result7).toMatchInlineSnapshot(`
|
||||
"
|
||||
{
|
||||
"bbox": [
|
||||
"550,216",
|
||||
"550,216",
|
||||
"550,216",
|
||||
"550,216"
|
||||
],
|
||||
"errors": []
|
||||
}
|
||||
"
|
||||
`);
|
||||
});
|
||||
|
||||
it('adaptDoubaoBbox with 2 points', () => {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user