diff --git a/packages/core/src/ai-model/common.ts b/packages/core/src/ai-model/common.ts index f0d67a154..4d721f1d0 100644 --- a/packages/core/src/ai-model/common.ts +++ b/packages/core/src/ai-model/common.ts @@ -97,7 +97,7 @@ export function adaptQwenBbox( } export function adaptDoubaoBbox( - bbox: number[] | string, + bbox: string[] | number[] | string, width: number, height: number, ): [number, number, number, number] { @@ -127,42 +127,65 @@ export function adaptDoubaoBbox( bbox = bbox[0]; } - if (bbox.length === 4 || bbox.length === 5) { + let bboxList: number[] = []; + if (Array.isArray(bbox) && typeof bbox[0] === 'string') { + bbox.forEach((item) => { + if (typeof item === 'string' && item.includes(',')) { + const [x, y] = item.split(','); + bboxList.push(Number(x.trim()), Number(y.trim())); + } else if (typeof item === 'string' && item.includes(' ')) { + const [x, y] = item.split(' '); + bboxList.push(Number(x.trim()), Number(y.trim())); + } else { + bboxList.push(Number(item)); + } + }); + } else { + bboxList = bbox as any; + } + + if (bboxList.length === 4 || bboxList.length === 5) { return [ - Math.round((bbox[0] * width) / 1000), - Math.round((bbox[1] * height) / 1000), - Math.round((bbox[2] * width) / 1000), - Math.round((bbox[3] * height) / 1000), + Math.round((bboxList[0] * width) / 1000), + Math.round((bboxList[1] * height) / 1000), + Math.round((bboxList[2] * width) / 1000), + Math.round((bboxList[3] * height) / 1000), ]; } // treat the bbox as a center point if ( - bbox.length === 6 || - bbox.length === 2 || - bbox.length === 3 || - bbox.length === 7 + bboxList.length === 6 || + bboxList.length === 2 || + bboxList.length === 3 || + bboxList.length === 7 ) { return [ - Math.max(0, Math.round((bbox[0] * width) / 1000) - defaultBboxSize / 2), - Math.max(0, Math.round((bbox[1] * height) / 1000) - defaultBboxSize / 2), + Math.max( + 0, + Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2, + ), + Math.max( + 0, + Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2, + ), Math.min( width, - Math.round((bbox[0] * width) / 1000) + defaultBboxSize / 2, + Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2, ), Math.min( height, - Math.round((bbox[1] * height) / 1000) + defaultBboxSize / 2, + Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2, ), ]; } if (bbox.length === 8) { return [ - Math.round((bbox[0] * width) / 1000), - Math.round((bbox[1] * height) / 1000), - Math.round((bbox[4] * width) / 1000), - Math.round((bbox[5] * height) / 1000), + Math.round((bboxList[0] * width) / 1000), + Math.round((bboxList[1] * height) / 1000), + Math.round((bboxList[4] * width) / 1000), + Math.round((bboxList[5] * height) / 1000), ]; } diff --git a/packages/core/src/ai-model/service-caller/index.ts b/packages/core/src/ai-model/service-caller/index.ts index aa602df78..69984fa34 100644 --- a/packages/core/src/ai-model/service-caller/index.ts +++ b/packages/core/src/ai-model/service-caller/index.ts @@ -31,6 +31,7 @@ import { getAIConfig, getAIConfigInBoolean, getAIConfigInJson, + uiTarsModelVersion, vlLocateMode, } from '@midscene/shared/env'; import { enableDebug, getDebug } from '@midscene/shared/logger'; @@ -285,7 +286,7 @@ export async function call( } debugProfileStats( - `model, ${model}, mode, ${vlLocateMode() || 'default'}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${Date.now() - startTime}, requestId, ${result._request_id || ''}`, + `model, ${model}, mode, ${vlLocateMode() || 'default'}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${Date.now() - startTime}, requestId, ${result._request_id || ''}`, ); debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`); diff --git a/packages/core/tests/unit-test/utils.test.ts b/packages/core/tests/unit-test/utils.test.ts index 8cb31a00a..4232a48c5 100644 --- a/packages/core/tests/unit-test/utils.test.ts +++ b/packages/core/tests/unit-test/utils.test.ts @@ -385,6 +385,30 @@ describe('doubao-vision', () => { ] `); }); + + it('adaptDoubaoBbox with string bbox', () => { + const result = adaptDoubaoBbox(['123 222', '789 100'], 1000, 2000); + expect(result).toMatchInlineSnapshot(` + [ + 123, + 444, + 789, + 200, + ] + `); + }); + + it('adaptDoubaoBbox with string bbox', () => { + const result = adaptDoubaoBbox(['123,222', '789, 100'], 1000, 2000); + expect(result).toMatchInlineSnapshot(` + [ + 123, + 444, + 789, + 200, + ] + `); + }); }); describe('doubao-vision', () => { @@ -415,6 +439,32 @@ describe('doubao-vision', () => { const input6 = '123 345 11111'; const result6 = preprocessDoubaoBboxJson(input6); expect(result6).toMatchInlineSnapshot(`"123 345 11111"`); + + const input7 = ` +{ + "bbox": [ + "550 216", + "550 216", + "550 216", + "550 216" + ], + "errors": [] +} + `; + const result7 = preprocessDoubaoBboxJson(input7); + expect(result7).toMatchInlineSnapshot(` + " + { + "bbox": [ + "550,216", + "550,216", + "550,216", + "550,216" + ], + "errors": [] + } + " + `); }); it('adaptDoubaoBbox with 2 points', () => {