feat(core): element describer (#748)

* feat(core): add element describer

* feat(core): add element describer

* chore(core): add test cases

* feat(core): put language settings into env

* fix(core): edge case for annotation

* chore(core): update describe settings

* chore(core): fix lint

* fix(core): remove unused cases

* feat(core): add describer widget

* feat(core): move describer to agent

* feat(core): update describer prompt

* feat(core): update describer prompt

* feat(core): add describer tool

* feat(core): add deepThink for describer

* fix(core): describer widget

* chore(core): fix lint

* docs(core): docs for MIDSCENE_PREFERRED_LANGUAGE

* feat(core): set context in locator dump
This commit is contained in:
yuyutaotao 2025-05-21 20:58:37 +08:00 committed by GitHub
parent 610e7979bf
commit 01b3576abd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
45 changed files with 925 additions and 332 deletions

View File

@ -1,8 +1,15 @@
import { PlayCircleOutlined } from '@ant-design/icons';
import type { UIContext } from '@midscene/core';
import { useStaticPageAgent } from '@midscene/visualizer';
import { Describer, useStaticPageAgent } from '@midscene/visualizer';
import type { WebUIContext } from '@midscene/web/utils';
import { Button, Drawer, Tooltip } from 'antd';
import {
Button,
ConfigProvider,
Drawer,
Tabs,
type TabsProps,
Tooltip,
} from 'antd';
import { useEffect, useState } from 'react';
import { StandardPlayground } from './playground';
import { useEnvConfig } from './store';
@ -26,6 +33,11 @@ const checkServerStatus = async () => {
}
};
const tabKeys = {
PLAYGROUND: 'playground',
ELEMENT_DESCRIBER: 'element-describer',
};
export const useServerValid = (shouldRun = true) => {
const [serverValid, setServerValid] = useState(false);
const { serviceMode } = useEnvConfig();
@ -79,6 +91,63 @@ export default function OpenInPlayground(props?: { context?: UIContext }) {
};
const agent = useStaticPageAgent(context as WebUIContext);
const tabItems: TabsProps['items'] = [
{
key: tabKeys.PLAYGROUND,
label: 'Playground',
},
...(location.href.indexOf('beta') >= 0
? [
{
key: tabKeys.ELEMENT_DESCRIBER,
label: 'Element Describer (Beta)',
},
]
: []),
];
const [activeTab, setActiveTab] = useState(tabKeys.PLAYGROUND);
let toolContent: React.ReactNode;
if (activeTab === tabKeys.PLAYGROUND) {
toolContent = (
<StandardPlayground
getAgent={() => {
return agent;
}}
dryMode={true}
hideLogo={true}
key={contextLoadingCounter}
/>
);
} else if (activeTab === tabKeys.ELEMENT_DESCRIBER) {
if (context) {
toolContent = (
<Describer uiContext={context} key={contextLoadingCounter} />
);
} else {
toolContent = <div>No context found</div>;
}
}
const tabComponent = (
<ConfigProvider
theme={{
components: {
Tabs: {
horizontalMargin: '0 0 -1px 10px',
},
},
}}
>
<Tabs
defaultActiveKey={activeTab}
items={tabItems}
onChange={setActiveTab}
/>
</ConfigProvider>
);
if (!ifPlaygroundValid) {
return (
<Tooltip
@ -109,25 +178,18 @@ export default function OpenInPlayground(props?: { context?: UIContext }) {
Open in Playground
</Button>
<Drawer
title="Playground"
title={tabComponent}
placement="right"
onClose={handleClose}
open={isDrawerVisible}
width="90%"
styles={{
header: { padding: '16px' },
header: { padding: '0 16px' },
body: { padding: '24px' },
}}
className="playground-drawer"
>
<StandardPlayground
getAgent={() => {
return agent;
}}
dryMode={true}
hideLogo={true}
key={contextLoadingCounter}
/>
{toolContent}
</Drawer>
</>
);

File diff suppressed because one or more lines are too long

View File

@ -46,6 +46,7 @@ Some advanced configs are also supported. Usually you don't need to use them.
| `OPENAI_USE_AZURE` | Optional. Set to "true" to use Azure OpenAI Service. See more details in the following section. |
| `MIDSCENE_OPENAI_INIT_CONFIG_JSON` | Optional. Custom JSON config for OpenAI SDK initialization |
| `MIDSCENE_OPENAI_SOCKS_PROXY` | Optional. Proxy configuration (e.g. "socks5://127.0.0.1:1080") |
| `MIDSCENE_PREFERRED_LANGUAGE` | Optional. The preferred language for the model response. The default is `Chinese` if the current timezone is GMT+8 and `English` otherwise. |
| `OPENAI_MAX_TOKENS` | Optional. Maximum tokens for model response |
### Debug configs

View File

@ -50,6 +50,7 @@ Midscene 默认集成了 OpenAI SDK 调用 AI 服务。使用这个 SDK 限定
| `OPENAI_USE_AZURE` | 可选。设置为 "true" 以使用 Azure OpenAI Service。更多详情请参阅后文 |
| `MIDSCENE_OPENAI_INIT_CONFIG_JSON` | 可选。OpenAI SDK 的初始化配置 JSON |
| `MIDSCENE_OPENAI_SOCKS_PROXY` | 可选。代理配置 (如 "socks5://127.0.0.1:1080") |
| `MIDSCENE_PREFERRED_LANGUAGE` | 可选。模型响应的语言。如果当前时区是 GMT+8 则默认是 `Chinese`,否则是 `English` |
| `OPENAI_MAX_TOKENS` | 可选。模型响应的 max_tokens 数 |
### 调试配置

View File

@ -39,6 +39,7 @@ export enum AIActionType {
INSPECT_ELEMENT = 1,
EXTRACT_DATA = 2,
PLAN = 3,
DESCRIBE_ELEMENT = 4,
}
export async function callAiFn<T>(

View File

@ -11,6 +11,7 @@ import type {
ElementById,
ElementTreeNode,
Rect,
ReferenceImage,
UIContext,
} from '@/types';
import {
@ -73,6 +74,7 @@ export async function AiLocateElement<
>(options: {
context: UIContext<ElementType>;
targetElementDescription: string;
referenceImage?: ReferenceImage;
callAI?: typeof callAiFn<AIElementResponse | [number, number]>;
searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;
}): Promise<{
@ -121,6 +123,15 @@ export async function AiLocateElement<
);
}
let referenceImagePayload: string | undefined;
if (options.referenceImage?.rect && options.referenceImage.base64) {
referenceImagePayload = await cropByRect(
options.referenceImage.base64,
options.referenceImage.rect,
getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),
);
}
const msgs: AIArgs = [
{ role: 'system', content: systemPrompt },
{

View File

@ -1,7 +1,8 @@
import { getPreferredLanguage } from '@midscene/shared/env';
import type { ResponseFormatJSONSchema } from 'openai/resources';
import { getTimeZoneInfo } from './ui-tars-planning';
export const language = getTimeZoneInfo().isChina ? 'Chinese' : 'English';
const preferredLanguage = getPreferredLanguage();
const defaultAssertionPrompt =
'You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.';
@ -21,7 +22,7 @@ const uiTarsAssertionResponseJsonFormat = `## Output Json String Format
## Rules **MUST** follow
- Make sure to return **only** the JSON, with **no additional** text or explanations.
- Use ${language} in \`thought\` part.
- Use ${preferredLanguage} in \`thought\` part.
- You **MUST** strictly follow up the **Output Json String Format**.`;
export function systemPromptToAssert(model: { isUITars: boolean }) {

View File

@ -0,0 +1,26 @@
import { getPreferredLanguage } from '@midscene/shared/env';
const preferredLanguage = getPreferredLanguage();
export const elementDescriberInstruction = () => {
return `Tell what is the content of the element wrapped by the read rectangle in the screenshot. Your description is expected to be used to precisely locate the element from other similar elements on screenshot. Use ${preferredLanguage} in the description.
Please follow the following rules:
1. The description should be start with a brief description, like "a button for confirming the action".
2. Include these information in the description to distinguish the element from its siblings and other similar elements, as much as possible:
- The text of the element, like "with text 'Confirm'"
- What the element looks like if it's an image, like "with image '...'"
- The relative position of the element, like "on the left of ..., around ..."
- How to distinguish the element from its siblings elements, like "it is the icon instead of the text"
3. Do NOT mention the red rectangle in the description.
4. Use the error field to describe the unexpected situations, if any. If not, put null.
Return in JSON:
{
"description": "[{brief description}]: {text of the element} {image of the element} {relative position of the element} ... ",
"error"?: "..."
}`;
};

View File

@ -1,19 +1,7 @@
export function getTimeZoneInfo(): { timezone: string; isChina: boolean } {
const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
const offset = -new Date().getTimezoneOffset() / 60;
return {
timezone: `UTC${offset >= 0 ? '+' : ''}${offset}`,
isChina: timeZone === 'Asia/Shanghai',
};
}
export function getLanguage(): string {
return getTimeZoneInfo().isChina ? 'Chinese' : 'English';
}
import { getPreferredLanguage } from '@midscene/shared/env';
export function getUiTarsPlanningPrompt(): string {
const language = getLanguage();
const preferredLanguage = getPreferredLanguage();
return `
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
@ -38,7 +26,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
## Note
- Use ${language} in \`Thought\` part.
- Use ${preferredLanguage} in \`Thought\` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
## User Instruction

View File

@ -253,13 +253,13 @@ export async function call(
let content: string | undefined;
let usage: OpenAI.CompletionUsage | undefined;
const commonConfig = {
temperature: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS) ? 0.0 : 0.1,
temperature: vlLocateMode() === 'vlm-ui-tars' ? 0.0 : 0.1,
stream: false,
max_tokens:
typeof maxTokens === 'number'
? maxTokens
: Number.parseInt(maxTokens || '2048', 10),
...(getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL) // qwen specific config
...(vlLocateMode() === 'qwen-vl' // qwen specific config
? {
vl_high_resolution_images: true,
}
@ -359,14 +359,13 @@ export async function callToGetJSONObject<T>(
case AIActionType.INSPECT_ELEMENT:
responseFormat = locatorSchema;
break;
case AIActionType.EXTRACT_DATA:
//TODO: Currently the restriction type can only be a json subset of the constraint, and the way the extract api is used needs to be adjusted to limit the user's data to this as well
// targetResponseFormat = extractDataSchema;
responseFormat = { type: AIResponseFormat.JSON };
break;
case AIActionType.PLAN:
responseFormat = planSchema;
break;
case AIActionType.EXTRACT_DATA:
case AIActionType.DESCRIBE_ELEMENT:
responseFormat = { type: AIResponseFormat.JSON };
break;
}
}

View File

@ -1,7 +1,18 @@
import { callAiFn } from '@/ai-model/common';
import { AiExtractElementInfo, AiLocateElement } from '@/ai-model/index';
import {
AIActionType,
type AIArgs,
callAiFn,
expandSearchArea,
} from '@/ai-model/common';
import {
AiExtractElementInfo,
AiLocateElement,
callToGetJSONObject,
} from '@/ai-model/index';
import { AiAssert, AiLocateSection } from '@/ai-model/inspect';
import { elementDescriberInstruction } from '@/ai-model/prompt/describe';
import type {
AIDescribeElementResponse,
AIElementResponse,
AISingleElementResponse,
AIUsageInfo,
@ -20,9 +31,11 @@ import type {
} from '@/types';
import {
MIDSCENE_FORCE_DEEP_THINK,
MIDSCENE_USE_QWEN_VL,
getAIConfigInBoolean,
vlLocateMode,
} from '@midscene/shared/env';
import { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';
import { getDebug } from '@midscene/shared/logger';
import { assert } from '@midscene/shared/utils';
import { emitInsightDump } from './utils';
@ -327,4 +340,74 @@ export default class Insight<
usage: assertResult.usage,
};
}
async describe(
target: Rect | [number, number],
opt?: {
deepThink?: boolean;
},
): Promise<Pick<AIDescribeElementResponse, 'description'>> {
assert(target, 'target is required for insight.describe');
const context = await this.contextRetrieverFn('describe');
const { screenshotBase64 } = context;
assert(screenshotBase64, 'screenshot is required for insight.describe');
const systemPrompt = elementDescriberInstruction();
// Convert [x,y] center point to Rect if needed
const defaultRectSize = 30;
const targetRect: Rect = Array.isArray(target)
? {
left: Math.floor(target[0] - defaultRectSize / 2),
top: Math.floor(target[1] - defaultRectSize / 2),
width: defaultRectSize,
height: defaultRectSize,
}
: target;
let imagePayload = await compositeElementInfoImg({
inputImgBase64: screenshotBase64,
elementsPositionInfo: [
{
rect: targetRect,
},
],
borderThickness: 3,
});
if (opt?.deepThink) {
const searchArea = expandSearchArea(targetRect, context.size);
debug('describe: set searchArea', searchArea);
imagePayload = await cropByRect(
imagePayload,
searchArea,
getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),
);
}
const msgs: AIArgs = [
{ role: 'system', content: systemPrompt },
{
role: 'user',
content: [
{
type: 'image_url',
image_url: {
url: imagePayload,
detail: 'high',
},
},
],
},
];
const callAIFn =
this.aiVendorFn || callToGetJSONObject<AIDescribeElementResponse>;
const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT);
const { content } = res;
assert(!content.error, `describe failed: ${content.error}`);
assert(content.description, 'failed to describe the element');
return content;
}
}

View File

@ -119,6 +119,28 @@ export interface AIAssertionResponse {
thought: string;
}
export interface AIDescribeElementResponse {
description: string;
error?: string;
}
export interface LocatorValidatorOption {
centerDistanceThreshold?: number;
}
export interface LocateValidatorResult {
pass: boolean;
rect: Rect;
center: [number, number];
centerDistance?: number;
}
export interface AgentDescribeElementAtPointResult {
prompt: string;
deepThink: boolean;
verifyResult?: LocateValidatorResult;
}
/**
* context
*/
@ -157,7 +179,7 @@ export interface InsightOptions {
export type EnsureObject<T> = { [K in keyof T]: any };
export type InsightAction = 'locate' | 'extract' | 'assert';
export type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';
export type InsightExtractParam = string | Record<string, string>;

View File

@ -1,4 +1,4 @@
import type { PlanningActionParamScroll } from './types';
import type { PlanningActionParamScroll, Rect } from './types';
export interface LocateOption {
prompt?: string;
@ -6,8 +6,14 @@ export interface LocateOption {
cacheable?: boolean; // user can set this param to false to disable the cache for a single agent api
}
export interface ReferenceImage {
base64: string;
rect?: Rect;
}
export interface DetailedLocateParam extends LocateOption {
prompt: string;
referenceImage?: ReferenceImage;
}
export interface scrollParam {

View File

@ -25,7 +25,7 @@ describe.skipIf(!vlMode)('insight locate with deep think', () => {
await sleep(3000);
});
test('insight locate with search area and think twice', async () => {
test('insight locate with search area - deep think', async () => {
const { context } = await getContextFromFixture('taobao');
const insight = new Insight(context);
@ -66,3 +66,26 @@ test.skip('insight locate with search area', async () => {
console.log(element, rect);
await sleep(3000);
});
describe('insight describe', () => {
test('insight describe - by rect', async () => {
const { context } = await getContextFromFixture('taobao');
const insight = new Insight(context);
const { description } = await insight.describe({
left: 580,
top: 140,
width: 80,
height: 30,
});
expect(description).toBeDefined();
});
test('insight describe - by center point', async () => {
const { context } = await getContextFromFixture('taobao');
const insight = new Insight(context);
const { description } = await insight.describe([580, 140]);
expect(description).toBeDefined();
});
});

View File

@ -0,0 +1,24 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`elementDescriberInstruction > should return the correct instruction 1`] = `
"Tell what is the content of the element wrapped by the read rectangle in the screenshot. Your description is expected to be used to precisely locate the element from other similar elements on screenshot. Use English in the description.
Please follow the following rules:
1. The description should be start with a brief description, like "a button for confirming the action".
2. Include these information in the description to distinguish the element from its siblings and other similar elements, as much as possible:
- The text of the element, like "with text 'Confirm'"
- What the element looks like if it's an image, like "with image '...'"
- The relative position of the element, like "on the left of ..., around ..."
- How to distinguish the element from its siblings elements, like "it is the icon instead of the text"
3. Do NOT mention the red rectangle in the description.
4. Use the error field to describe the unexpected situations, if any. If not, put null.
Return in JSON:
{
"description": "[{brief description}]: {text of the element} {image of the element} {relative position of the element} ... ",
"error"?: "..."
}"
`;

View File

@ -2,18 +2,17 @@ import { systemPromptToAssert } from '@/ai-model/prompt/assertion';
import { describe, expect, it, vi } from 'vitest';
describe('Assertion prompt', () => {
vi.mock('@midscene/shared/env', () => ({
getPreferredLanguage: vi.fn().mockReturnValue('English'),
}));
it('return default when it is not UI-Tars', () => {
const prompt = systemPromptToAssert({ isUITars: false });
expect(prompt).toMatchSnapshot();
});
it('return UI-Tars specific when it is UI-Tars', () => {
vi.mock('@/ai-model/prompt/ui-tars-planning', () => ({
getTimeZoneInfo: vi.fn().mockReturnValue({ isChina: false }),
}));
const prompt = systemPromptToAssert({ isUITars: true });
expect(prompt).toMatchSnapshot();
});
});

View File

@ -0,0 +1,12 @@
import { elementDescriberInstruction } from '@/ai-model/prompt/describe';
import { describe, expect, it, vi } from 'vitest';
describe('elementDescriberInstruction', () => {
vi.mock('@midscene/shared/env', () => ({
getPreferredLanguage: vi.fn().mockReturnValue('English'),
}));
it('should return the correct instruction', () => {
expect(elementDescriberInstruction()).toMatchSnapshot();
});
});

View File

@ -1,32 +0,0 @@
import {
getLanguage,
getTimeZoneInfo,
} from '@/ai-model/prompt/ui-tars-planning';
import { afterEach, describe, expect, it } from 'vitest';
import { mockNonChinaTimeZone, restoreIntl } from '../mocks/intl-mock';
describe('UI TARS Planning Functions', () => {
afterEach(() => {
restoreIntl();
});
it('getTimeZoneInfo returns original timezone without mock', () => {
// This test will vary based on the system running it
const info = getTimeZoneInfo();
// We don't assert on specific values here as they depend on the local environment
expect(info).toHaveProperty('timezone');
expect(info).toHaveProperty('isChina');
expect(typeof info.timezone).toBe('string');
expect(typeof info.isChina).toBe('boolean');
});
it('getTimeZoneInfo returns non-China timezone with mock', () => {
mockNonChinaTimeZone();
const info = getTimeZoneInfo();
expect(info.isChina).toBe(false);
const language = getLanguage();
expect(language).toBe('English');
});
});

View File

@ -5,52 +5,49 @@
"prompt": "'最简单的用法'下方有五个 icon左侧第一个 icon",
"annotation_index_id": 1,
"response_rect": {
"left": 486,
"top": 859,
"width": 21,
"height": 14
"left": 538,
"top": 769,
"width": 12,
"height": 13
},
"response_element": {
"id": "nkpld",
"indexId": 104
"id": "klhmg"
}
},
{
"prompt": "'最简单的用法'下方有五个 icon左侧第二个 icon",
"annotation_index_id": 2,
"response_rect": {
"left": 519,
"top": 860,
"width": 15,
"height": 12
"left": 568,
"top": 739,
"width": 16,
"height": 16
},
"response_element": {
"id": "hdbbh",
"indexId": 105
"id": "mflch"
}
},
{
"prompt": "'最简单的用法'下方有五个 icon左侧第三个 icon",
"annotation_index_id": 3,
"response_rect": {
"left": 549,
"top": 860,
"width": 18,
"left": 538,
"top": 769,
"width": 12,
"height": 12
},
"response_element": {
"id": "ncono",
"indexId": 106
"id": "kbgij"
}
},
{
"prompt": "'最简单的用法'下方有五个 icon左侧第四个 icon",
"annotation_index_id": 4,
"response_rect": {
"left": 584,
"left": 583,
"top": 860,
"width": 13,
"height": 12
"width": 14,
"height": 14
},
"response_element": {
"id": "jkeam",
@ -61,70 +58,67 @@
"prompt": "'最简单的用法'下方有五个 icon最右侧的 icon",
"annotation_index_id": 5,
"response_rect": {
"left": 617,
"top": 862,
"width": 13,
"height": 9
"left": 587,
"top": 863,
"width": 14,
"height": 11
},
"response_element": {
"id": "nnkcf",
"indexId": 108
"id": "jkeam",
"indexId": 107
}
},
{
"prompt": "全屏幕右上角、版本号右侧有三个 icon ,查找左侧第一个",
"annotation_index_id": 6,
"response_rect": {
"left": 1269,
"top": 24,
"width": 20,
"height": 13
"left": 876,
"top": 35,
"width": 48,
"height": 26
},
"response_element": {
"id": "dinoj",
"indexId": 13
"id": "jmdcl"
}
},
{
"prompt": "全屏幕右上角有三个 icon ,查找左侧第二个",
"annotation_index_id": 7,
"response_rect": {
"left": 1309,
"top": 24,
"width": 26,
"height": 16
"left": 876,
"top": 35,
"width": 34,
"height": 7
},
"response_element": {
"id": "nfpha",
"indexId": 14
"id": "hdpjf"
}
},
{
"prompt": "屏幕右上角有三个 icon ,左侧第三个",
"annotation_index_id": 8,
"response_rect": {
"left": 1356,
"top": 24,
"width": 20,
"height": 16
"left": 867,
"top": 35,
"width": 59,
"height": 29
},
"response_element": {
"id": "hmbld",
"indexId": 15
"id": "jnobj"
}
},
{
"prompt": "在‘代码演示’右侧有三个 icon 按钮中,查找最中间的按钮",
"annotation_index_id": 9,
"response_rect": {
"left": 1184,
"top": 497,
"width": 16,
"height": 16
"left": 863,
"top": 574,
"width": 216,
"height": 33
},
"response_element": {
"id": "pkafb",
"indexId": 94
"id": "mjcce",
"indexId": 112
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 356 KiB

After

Width:  |  Height:  |  Size: 357 KiB

View File

@ -6,9 +6,9 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 721,
"top": 245,
"width": 72,
"left": 723,
"top": 246,
"width": 86,
"height": 15
},
"response_element": {
@ -21,10 +21,10 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 485,
"top": 246,
"width": 72,
"height": 15
"left": 486,
"top": 239,
"width": 84,
"height": 28
},
"response_element": {
"id": "aonmh",
@ -36,14 +36,13 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 492,
"top": 341,
"width": 294,
"height": 49
"left": 493,
"top": 367,
"width": 308,
"height": 25
},
"response_element": {
"id": "mfodf",
"indexId": 10
"id": "aelca"
}
},
{
@ -51,14 +50,13 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 492,
"top": 418,
"width": 294,
"height": 50
"left": 487,
"top": 365,
"width": 303,
"height": 47
},
"response_element": {
"id": "nhbof",
"indexId": 12
"id": "dahmc"
}
},
{
@ -67,9 +65,9 @@
"annotation_index_id": 5,
"response_rect": {
"left": 697,
"top": 435,
"width": 71,
"height": 14
"top": 438,
"width": 68,
"height": 13
},
"response_element": {
"id": "kdbdc",
@ -81,10 +79,10 @@
"multi": false,
"annotation_index_id": 6,
"response_rect": {
"left": 492,
"top": 558,
"width": 294,
"height": 45
"left": 603,
"top": 574,
"width": 86,
"height": 18
},
"response_element": {
"id": "bjnpl",
@ -96,10 +94,10 @@
"multi": false,
"annotation_index_id": 7,
"response_rect": {
"left": 845,
"top": 120,
"width": 15,
"height": 15
"left": 846,
"top": 123,
"width": 11,
"height": 16
},
"response_element": {
"id": "aigcl",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 191 KiB

After

Width:  |  Height:  |  Size: 192 KiB

View File

@ -5,9 +5,9 @@
"prompt": "左下角暂停按钮",
"annotation_index_id": 1,
"response_rect": {
"left": 17,
"left": 8,
"top": 769,
"width": 15,
"width": 27,
"height": 15
},
"response_element": {
@ -19,70 +19,65 @@
"prompt": "点赞(爱心)按钮",
"annotation_index_id": 2,
"response_rect": {
"left": 1204,
"top": 352,
"width": 23,
"height": 19
"left": 1348,
"top": 569,
"width": 24,
"height": 21
},
"response_element": {
"id": "ebgie",
"indexId": 23
"id": "bomgc"
}
},
{
"prompt": "评论按钮",
"annotation_index_id": 3,
"response_rect": {
"left": 1205,
"top": 426,
"width": 22,
"height": 17
"left": 1346,
"top": 578,
"width": 46,
"height": 42
},
"response_element": {
"id": "cjmim",
"indexId": 25
"id": "hiono"
}
},
{
"prompt": "书签收藏按钮",
"annotation_index_id": 4,
"response_rect": {
"left": 1203,
"top": 498,
"width": 27,
"height": 22
"left": -1,
"top": -3598647,
"width": 1285,
"height": 3599459
},
"response_element": {
"id": "moimk",
"indexId": 27
"id": "fkleb"
}
},
{
"prompt": "分享按钮",
"annotation_index_id": 5,
"response_rect": {
"left": 1203,
"top": 576,
"width": 25,
"height": 18
"left": -3568497,
"top": -3568497,
"width": 0,
"height": 0
},
"response_element": {
"id": "mgcne",
"indexId": 29
"id": "dcppg"
}
},
{
"prompt": "右下角区域声音按钮",
"annotation_index_id": 6,
"response_rect": {
"left": 1203,
"top": 769,
"width": 22,
"height": 15
"left": 1176,
"top": 768,
"width": 27,
"height": 26
},
"response_element": {
"id": "djknm",
"indexId": 10
"id": "eodda"
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 117 KiB

After

Width:  |  Height:  |  Size: 120 KiB

View File

@ -6,10 +6,10 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 17,
"top": 20,
"width": 22,
"height": 16
"left": 9,
"top": 18,
"width": 34,
"height": 39
},
"response_element": {
"id": "amjle",
@ -21,14 +21,13 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 58,
"top": 16,
"width": 66,
"height": 23
"left": 95,
"top": 34,
"width": 83,
"height": 6
},
"response_element": {
"id": "kfmhg",
"indexId": 1
"id": "mbdlb"
}
},
{
@ -36,10 +35,10 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 352,
"top": 19,
"width": 20,
"height": 17
"left": 354,
"top": 18,
"width": 18,
"height": 21
},
"response_element": {
"id": "podpa",
@ -51,14 +50,13 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 190,
"top": 724,
"width": 38,
"height": 19
"left": 348,
"top": 759,
"width": 14,
"height": 8
},
"response_element": {
"id": "dmggl",
"indexId": 20
"id": "eagmn"
}
},
{
@ -66,10 +64,10 @@
"multi": false,
"annotation_index_id": 5,
"response_rect": {
"left": 301,
"top": 864,
"width": 86,
"height": 18
"left": 304,
"top": 859,
"width": 83,
"height": 23
},
"response_element": {
"id": "cdmma",
@ -81,14 +79,13 @@
"multi": false,
"annotation_index_id": 6,
"response_rect": {
"left": 369,
"top": 825,
"width": 21,
"height": 16
"left": -348,
"top": -579,
"width": 87,
"height": 271
},
"response_element": {
"id": "ddeal",
"indexId": 27
"id": "dcppg"
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 373 KiB

After

Width:  |  Height:  |  Size: 377 KiB

View File

@ -6,14 +6,13 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 187,
"top": 276,
"width": 83,
"height": 19
"left": 89,
"top": 167,
"width": 361,
"height": 46
},
"response_element": {
"id": "eabha",
"indexId": 20
"id": "bkggi"
}
},
{
@ -21,10 +20,10 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 324,
"top": 801,
"width": 62,
"height": 16
"left": 348,
"top": 796,
"width": 37,
"height": 25
},
"response_element": {
"id": "kmdfd",
@ -36,14 +35,13 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 189,
"top": 802,
"width": 40,
"height": 15
"left": 94,
"top": 765,
"width": 294,
"height": 37
},
"response_element": {
"id": "eommc",
"indexId": 38
"id": "lmngi"
}
},
{
@ -51,10 +49,10 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 6,
"top": 382,
"width": 34,
"height": 13
"left": 4,
"top": 385,
"width": 63,
"height": 24
},
"response_element": {
"id": "lalae",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 289 KiB

After

Width:  |  Height:  |  Size: 292 KiB

View File

@ -6,13 +6,14 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 319,
"top": 54,
"width": 533,
"height": 40
"left": 346,
"top": 57,
"width": 546,
"height": 34
},
"response_element": {
"id": "aljah"
"id": "jfjah",
"indexId": 27
}
},
{
@ -20,10 +21,10 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 792,
"top": 65,
"width": 38,
"height": 16
"left": 803,
"top": 64,
"width": 22,
"height": 15
},
"response_element": {
"id": "ondpi",
@ -36,14 +37,14 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 118,
"top": 428,
"width": 53,
"height": 12
"left": 89,
"top": 430,
"width": 67,
"height": 15
},
"response_element": {
"id": "cjfcl",
"indexId": 99
"id": "hgioh",
"indexId": 98
}
},
{
@ -52,14 +53,13 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 1065,
"top": 385,
"width": 21,
"height": 19
"left": 1191,
"top": 389,
"width": 43,
"height": 34
},
"response_element": {
"id": "fkfdl",
"indexId": 190
"id": "hpjie"
}
},
{
@ -68,29 +68,28 @@
"multi": false,
"annotation_index_id": 5,
"response_rect": {
"left": 1251,
"top": 352,
"width": 22,
"height": 22
"left": 1158,
"top": 389,
"width": 12,
"height": 14
},
"response_element": {
"id": "iegkg",
"indexId": 212
"id": "knffl"
}
},
{
"prompt": "顶部工具栏的购物车 icon",
"deepThink": true,
"response_rect": {
"left": 837,
"top": 12,
"width": 17,
"height": 14
"left": 869,
"top": 9,
"width": 21,
"height": 16
},
"annotation_index_id": 6,
"response_element": {
"id": "aefln",
"indexId": 12
"id": "mlkcg",
"indexId": 13
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

After

Width:  |  Height:  |  Size: 1.0 MiB

View File

@ -6,14 +6,13 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 512,
"top": 127,
"width": 556,
"height": 71
"left": 548,
"top": 179,
"width": 55,
"height": 21
},
"response_element": {
"id": "okgbn",
"indexId": 18
"id": "lbjjf"
}
},
{
@ -21,14 +20,13 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 512,
"top": 127,
"width": 556,
"height": 71
"left": -498,
"top": -576,
"width": 1701,
"height": 1304
},
"response_element": {
"id": "okgbn",
"indexId": 18
"id": "ffncc"
}
},
{
@ -36,14 +34,13 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 574,
"top": 276,
"width": 117,
"height": 17
"left": 587,
"top": 274,
"width": 376,
"height": 24
},
"response_element": {
"id": "idmhb",
"indexId": 24
"id": "micif"
}
},
{
@ -51,10 +48,10 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 1028,
"top": 279,
"width": 15,
"height": 12
"left": 987,
"top": 254,
"width": 76,
"height": 46
},
"response_element": {
"id": "jicbk",
@ -66,10 +63,10 @@
"multi": false,
"annotation_index_id": 5,
"response_rect": {
"left": 521,
"top": 334,
"width": 33,
"height": 26
"left": 527,
"top": 340,
"width": 21,
"height": 21
},
"response_element": {
"id": "kjccf",
@ -81,10 +78,10 @@
"multi": false,
"annotation_index_id": 6,
"response_rect": {
"left": 802,
"left": 804,
"top": 391,
"width": 69,
"height": 12
"width": 68,
"height": 14
},
"response_element": {
"id": "ddapc",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 305 KiB

After

Width:  |  Height:  |  Size: 309 KiB

View File

@ -85,11 +85,6 @@ testSources.forEach((source) => {
indexId,
rect,
});
// // biome-ignore lint/performance/noDelete: <explanation>
// delete (testCase as any).response_bbox;
// // biome-ignore lint/performance/noDelete: <explanation>
// delete (testCase as any).response;
}
if (element) {

View File

@ -31,6 +31,8 @@ export const MATCH_BY_POSITION = 'MATCH_BY_POSITION';
export const MIDSCENE_API_TYPE = 'MIDSCENE-API-TYPE';
export const MIDSCENE_REPORT_TAG_NAME = 'MIDSCENE_REPORT_TAG_NAME';
export const MIDSCENE_PREFERRED_LANGUAGE = 'MIDSCENE_PREFERRED_LANGUAGE';
export const MIDSCENE_USE_AZURE_OPENAI = 'MIDSCENE_USE_AZURE_OPENAI';
export const MIDSCENE_AZURE_OPENAI_SCOPE = 'MIDSCENE_AZURE_OPENAI_SCOPE';
export const MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON =
@ -103,6 +105,8 @@ export const allConfigFromEnv = () => {
[MIDSCENE_MCP_USE_PUPPETEER_MODE]:
process.env[MIDSCENE_MCP_USE_PUPPETEER_MODE] || undefined,
[MIDSCENE_RUN_DIR]: process.env[MIDSCENE_RUN_DIR] || undefined,
[MIDSCENE_PREFERRED_LANGUAGE]:
process.env[MIDSCENE_PREFERRED_LANGUAGE] || undefined,
};
};
@ -244,3 +248,13 @@ export const overrideAIConfig = (
? { ...currentConfig, ...newConfig }
: { ...newConfig };
};
export const getPreferredLanguage = () => {
if (getAIConfig(MIDSCENE_PREFERRED_LANGUAGE)) {
return getAIConfig(MIDSCENE_PREFERRED_LANGUAGE);
}
const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
const isChina = timeZone === 'Asia/Shanghai';
return isChina ? 'Chinese' : 'English';
};

View File

@ -1,6 +1,6 @@
import assert from 'node:assert';
import type Jimp from 'jimp';
import type { BaseElement } from '../types';
import type { BaseElement, Rect } from '../types';
import getJimp from './get-jimp';
import { bufferFromBase64, imageInfoOfBase64 } from './index';
@ -21,11 +21,17 @@ const loadFonts = async () => {
}
};
interface ElementForOverlay {
rect: Rect;
indexId?: number;
}
const createSvgOverlay = async (
elements: Array<BaseElement>,
elements: Array<ElementForOverlay>,
imageWidth: number,
imageHeight: number,
boxPadding = 5,
borderThickness = 2,
prompt?: string,
): Promise<Jimp> => {
const Jimp = await getJimp();
@ -86,17 +92,21 @@ const createSvgOverlay = async (
const color = colors[index % colors.length];
// Add 5px padding to the rect
const paddedLeft = Math.max(0, element.rect.left - boxPadding);
const paddedTop = Math.max(0, element.rect.top - boxPadding);
const paddedWidth = Math.min(
imageWidth - paddedLeft,
element.rect.width + boxPadding * 2,
);
const paddedHeight = Math.min(
imageHeight - paddedTop,
element.rect.height + boxPadding * 2,
);
const paddedRect = {
left: Math.max(0, element.rect.left - boxPadding),
top: Math.max(0, element.rect.top - boxPadding),
width: Math.min(
imageWidth - element.rect.left,
element.rect.width + boxPadding * 2,
),
height: Math.min(
imageHeight - element.rect.top,
element.rect.height + boxPadding * 2,
),
left: paddedLeft,
top: paddedTop,
width: paddedWidth,
height: paddedHeight,
};
// Draw rectangle
@ -107,10 +117,12 @@ const createSvgOverlay = async (
paddedRect.height,
(x: number, y: number, idx: number): void => {
if (
x === paddedRect.left ||
x === paddedRect.left + paddedRect.width - 1 ||
y === paddedRect.top ||
y === paddedRect.top + paddedRect.height - 1
(x >= paddedRect.left && x < paddedRect.left + borderThickness) || // Left border
(x <= paddedRect.left + paddedRect.width - 1 &&
x > paddedRect.left + paddedRect.width - borderThickness) || // Right border
(y >= paddedRect.top && y < paddedRect.top + borderThickness) || // Top border
(y <= paddedRect.top + paddedRect.height - 1 &&
y > paddedRect.top + paddedRect.height - borderThickness) // Bottom border
) {
image.bitmap.data[idx + 0] = (color.rect >> 24) & 0xff; // R
image.bitmap.data[idx + 1] = (color.rect >> 16) & 0xff; // G
@ -234,9 +246,10 @@ const createSvgOverlay = async (
export const compositeElementInfoImg = async (options: {
inputImgBase64: string;
elementsPositionInfo: Array<BaseElement>;
elementsPositionInfo: Array<ElementForOverlay>;
size?: { width: number; height: number };
annotationPadding?: number;
borderThickness?: number;
prompt?: string;
}) => {
assert(options.inputImgBase64, 'inputImgBase64 is required');
@ -280,6 +293,7 @@ export const compositeElementInfoImg = async (options: {
width,
height,
options.annotationPadding,
options.borderThickness,
prompt,
);
const svgImage = await Jimp.read(svgOverlay);

View File

@ -12,10 +12,26 @@ import { useBlackboardPreference } from './store/store';
const itemFillAlpha = 0.4;
const highlightAlpha = 0.4;
const pointRadius = 10;
// Shared no-op callback, used as a safe default for optional handlers.
const noop = () => {
// noop
};
/**
 * Build a PIXI graphics object that marks a single screen point with a
 * filled circle in the element highlight color.
 */
export const pointMarkForItem = (
  point: [number, number],
  type: 'highlightPoint',
) => {
  const circle = new PIXI.Graphics();
  const fillColor = highlightColorForType('element');
  // render a translucent filled circle centered on the point
  circle.beginFill(fillColor, itemFillAlpha);
  circle.drawCircle(point[0], point[1], pointRadius);
  circle.endFill();
  return circle;
};
export const rectMarkForItem = (
rect: Rect,
name: string,
@ -49,6 +65,9 @@ export const rectMarkForItem = (
graphics.filters = [dropShadowFilter];
const nameFontSize = 18;
if (!name) {
return [graphics];
}
const texts = new PIXI.Text(name, {
fontSize: nameFontSize,
fill: 0x0,
@ -62,11 +81,14 @@ export const Blackboard = (props: {
uiContext: UIContext;
highlightElements?: BaseElement[];
highlightRect?: Rect;
highlightPoints?: [number, number][];
hideController?: boolean;
onCanvasClick?: (position: [number, number]) => void;
}): JSX.Element => {
const highlightElements: BaseElement[] = props.highlightElements || [];
const highlightIds = highlightElements.map((e) => e.id);
const highlightRect = props.highlightRect;
const highlightPoints = props.highlightPoints;
const context = props.uiContext!;
const { size, screenshotBase64 } = context;
@ -128,6 +150,28 @@ export const Blackboard = (props: {
};
}, [app, screenWidth, screenHeight]);
// Wire up canvas click handling once the PIXI app is initialized.
useEffect(() => {
  if (!appInitialed) {
    return;
  }
  // Enable interaction on the stage and all its children
  app.stage.eventMode = 'static';
  app.stage.hitArea = new PIXI.Rectangle(0, 0, screenWidth, screenHeight);

  const clickHandler = (event: PIXI.FederatedPointerEvent) => {
    const { x, y } = event.data.global;
    props.onCanvasClick?.([Math.round(x), Math.round(y)]);
  };
  app.stage.on('click', clickHandler);

  return () => {
    // Remove only our handler: calling off('click') with no listener
    // argument would detach every click listener on the stage,
    // including ones registered elsewhere.
    app?.stage?.off('click', clickHandler);
  };
}, [appInitialed, props.onCanvasClick, screenWidth, screenHeight]);
// draw all texts on PIXI app
useEffect(() => {
if (!appInitialed) {
@ -144,13 +188,18 @@ export const Blackboard = (props: {
backgroundSprite.y = 0;
backgroundSprite.width = screenWidth;
backgroundSprite.height = screenHeight;
// Ensure the background doesn't block interactivity
backgroundSprite.eventMode = 'passive';
app.stage.addChildAt(backgroundSprite, 0);
pixiBgRef.current = backgroundSprite;
};
img.onerror = (e) => {
console.error('load screenshot failed', e);
};
img.src = screenshotBase64;
}, [app.stage, appInitialed]);
}, [app.stage, appInitialed, screenWidth, screenHeight]);
const { highlightElementRects } = useMemo(() => {
const highlightElementRects: Rect[] = [];
@ -158,8 +207,11 @@ export const Blackboard = (props: {
highlightContainer.removeChildren();
elementMarkContainer.removeChildren();
// Make containers interactive but not blocking events
highlightContainer.eventMode = 'passive';
elementMarkContainer.eventMode = 'passive';
if (highlightRect) {
console.log('highlightRect', highlightRect);
const [graphics] = rectMarkForItem(
highlightRect,
'Search Area',
@ -176,6 +228,13 @@ export const Blackboard = (props: {
});
}
if (highlightPoints?.length) {
highlightPoints.forEach((point) => {
const graphics = pointMarkForItem(point, 'highlightPoint');
highlightContainer.addChild(graphics);
});
}
// element rects
context.content.forEach((element) => {
const { rect, content, id } = element;
@ -200,6 +259,7 @@ export const Blackboard = (props: {
context.content,
hoverElement,
highlightRect,
highlightPoints,
// bgVisible,
// elementsVisible,
]);

View File

@ -0,0 +1,24 @@
// Styles for the element-describer widget: the screenshot acts as the
// positioning context and a status text bar is pinned to its bottom edge.
.image-describer {
position: relative;
// status / result bar overlaid on the bottom of the screenshot
.describe-text {
box-sizing: border-box;
position: absolute;
background: #000;
width: 100%;
height: 30px;
left: 0;
bottom: 0;
color: #FFF;
font-size: 12px;
padding: 10px;
}
// green bar when the describe round-trip verified successfully
.describe-text.success {
background: #047704;
}
// red bar when describing or verification failed
.describe-text.error {
background: #870707;
}
}

View File

@ -0,0 +1,150 @@
'use client';
import type {
AgentDescribeElementAtPointResult,
Rect,
UIContext,
} from '@midscene/core';
import type { WebUIContext } from '@midscene/web/utils';
import { useEffect, useRef, useState } from 'react';
import { useStaticPageAgent } from './playground/useStaticPageAgent';
import './describer.less';
import { Panel, PanelGroup, PanelResizeHandle } from 'react-resizable-panels';
import { Blackboard } from './blackboard';
import { PlaygroundResultView } from './playground/PlaygroundResult';
/**
 * Element-describer widget: renders the page screenshot, lets the user
 * click a point on it, asks the agent to describe the element at that
 * point, and shows the resulting prompt (or error) alongside highlight
 * markers for the clicked point and the verified element rect.
 */
export const Describer = (props: { uiContext: UIContext }): JSX.Element => {
const { uiContext } = props;
const image = uiContext.screenshotBase64;
const canvasRef = useRef<HTMLCanvasElement>(null);
// points the user has clicked (drawn as circles on the blackboard)
const [highlightPoints, setHighlightPoints] = useState<[number, number][]>(
[],
);
// rect of the element re-located during verification, if any
const [highlightRect, setHighlightRect] = useState<Rect | undefined>();
const [error, setError] = useState<string | undefined>();
const [loading, setLoading] = useState(false);
const [result, setResult] = useState<
AgentDescribeElementAtPointResult | undefined
>();
const agent = useStaticPageAgent(uiContext as WebUIContext);

// Paint the screenshot onto the (offscreen) canvas whenever it changes.
useEffect(() => {
const canvas = canvasRef.current;
if (!canvas || !image) return;

const ctx = canvas.getContext('2d');
if (!ctx) return;

const img = new Image();
img.onload = () => {
// Set canvas dimensions to match the image
canvas.width = img.width;
canvas.height = img.height;
// Draw the image on the canvas
ctx.drawImage(img, 0, 0);
};
// Set the image source (base64 data)
img.src = image;
}, [image]);

// Handle a click on the blackboard: reset state, mark the point, then
// ask the agent to describe (and verify) the element at that point.
const handleClick = async (position: [number, number]) => {
if (!agent) {
console.error('agent is not initialized');
return;
}
setLoading(true);
setError(undefined);
setResult(undefined);
setHighlightPoints([]);
setHighlightRect(undefined);
try {
const userLocation: [number, number] = [position[0], position[1]];
setHighlightPoints([userLocation]);
const result = await agent?.describeElementAtPoint(userLocation);
console.log('describe result', result);
setResult(result);
if (result.verifyResult?.rect) {
setHighlightRect(result.verifyResult.rect);
}
} catch (error: any) {
setError(error.message);
} finally {
setLoading(false);
}
};

// Derive the text shown in the result panel from error/result/loading.
let resultText = '';
if (error) {
resultText = error;
} else if (result && !result.verifyResult?.pass) {
resultText = `Locate failed with prompt: ${result.prompt}`;
} else if (result) {
if (result.deepThink) {
resultText = `Deep think: ${result.prompt}`;
} else {
resultText = result.prompt;
}
} else if (loading) {
resultText = 'Loading...';
}

return (
<div className="image-describer">
<PanelGroup autoSaveId="describer-layout" direction="horizontal">
<Panel
defaultSize={32}
maxSize={60}
minSize={20}
style={{ paddingRight: '24px' }}
>
<div className="form-part context-panel">
<h3>Screenshot</h3>
<div className="form-sub-title">
Click on the screenshot, Midscene will help you describe the
element at the clicked point.
</div>
<Blackboard
uiContext={{
...uiContext,
content: [], // remove all contents
tree: {
node: null,
children: [],
},
}}
highlightPoints={highlightPoints}
highlightRect={highlightRect}
onCanvasClick={handleClick}
hideController={true}
/>
</div>
</Panel>
<PanelResizeHandle className="panel-resize-handle" />
<Panel>
<PlaygroundResultView
result={{
result: resultText,
error: error || null,
}}
loading={loading}
serverValid={true}
serviceMode={'In-Browser'}
replayScriptsInfo={null}
replayCounter={0}
loadingProgressText={''}
/>
</Panel>
</PanelGroup>
</div>
);
};

export default Describer;

View File

@ -6,7 +6,8 @@ body {
font-size: 14px;
}
.playground-container {
.playground-container,
.image-describer {
width: 100%;
height: 100%;
@ -35,7 +36,6 @@ body {
overflow-y: auto !important;
.ant-form {
flex-grow: 1;
display: flex;
flex-direction: column;
@ -50,6 +50,11 @@ body {
font-size: 18px;
}
.form-sub-title {
margin-bottom: 12px;
font-size: 14px;
}
.switch-btn-wrapper {
.ant-btn {
padding: 0;

View File

@ -6,8 +6,8 @@ import type { WebUIContext } from '@midscene/web/utils';
// result type
export interface PlaygroundResult {
result: any;
dump: GroupedActionDump | null;
reportHTML: string | null;
dump?: GroupedActionDump | null;
reportHTML?: string | null;
error: string | null;
}

View File

@ -28,6 +28,7 @@ export { PromptInput } from './component/playground/PromptInput';
export { Player } from './component/player';
export { Blackboard } from './component/blackboard';
export { GithubStar } from './component/github-star';
export { Describer } from './component/describer';
// Export playground utilities
export {

View File

@ -144,7 +144,6 @@ export class ExtensionBridgePageBrowserSide extends ChromeExtensionProxyPage {
},
) {
const tabs = await chrome.tabs.query({ active: true, currentWindow: true });
console.log('current tab', tabs);
const tabId = tabs[0]?.id;
assert(tabId, 'failed to get tabId');

View File

@ -1,6 +1,7 @@
import type { WebPage } from '@/common/page';
import {
type AgentAssertOpt,
type AgentDescribeElementAtPointResult,
type AgentWaitForOpt,
type DetailedLocateParam,
type ExecutionDump,
@ -11,9 +12,12 @@ import {
type InsightAction,
type LocateOption,
type LocateResultElement,
type LocateValidatorResult,
type LocatorValidatorOption,
type MidsceneYamlScript,
type OnTaskStartTip,
type PlanningActionParamScroll,
type Rect,
} from '@midscene/core';
import yaml from 'js-yaml';
@ -49,6 +53,18 @@ import { type WebUIContext, parseContextFromWebPage } from './utils';
const debug = getDebug('web-integration');
// Euclidean distance between two screen points, rounded to whole pixels.
const distanceOfTwoPoints = (p1: [number, number], p2: [number, number]) => {
  const dx = p1[0] - p2[0];
  const dy = p1[1] - p2[1];
  return Math.round(Math.hypot(dx, dy));
};
// Whether a point lies inside (or exactly on the edge of) the rect.
const includedInRect = (point: [number, number], rect: Rect) => {
  const withinX = point[0] >= rect.left && point[0] <= rect.left + rect.width;
  const withinY = point[1] >= rect.top && point[1] <= rect.top + rect.height;
  return withinX && withinY;
};
export interface PageAgentOpt {
forceSameTabNavigation?: boolean /* if limit the new tab to the current page, default true */;
testId?: string;
@ -404,6 +420,88 @@ export class PageAgent<PageType extends WebPage = WebPage> {
return output;
}
/**
 * Describe the element at a screen point with a natural-language prompt,
 * then (unless disabled) verify the prompt by re-locating the element
 * and checking the result matches the original point.
 *
 * Retries up to `retryLimit` times when verification fails; from the
 * third attempt on, deepThink is forced to help with ambiguous targets.
 *
 * @param center screen coordinates of the target element's center
 * @param opt verifyPrompt (default true), retryLimit (default 3),
 *            deepThink, plus locator-validation thresholds
 * @returns the generated prompt, whether deepThink was used, and the
 *          verification result (undefined when verification is skipped)
 */
async describeElementAtPoint(
  center: [number, number],
  opt?: {
    verifyPrompt?: boolean;
    retryLimit?: number;
    deepThink?: boolean;
  } & LocatorValidatorOption,
): Promise<AgentDescribeElementAtPointResult> {
  const { verifyPrompt = true, retryLimit = 3 } = opt || {};

  let success = false;
  let retryCount = 0;
  let resultPrompt = '';
  let deepThink = opt?.deepThink || false;
  let verifyResult: LocateValidatorResult | undefined;

  while (!success && retryCount < retryLimit) {
    // escalate to deepThink after two failed attempts
    if (retryCount >= 2) {
      deepThink = true;
    }
    debug(
      'aiDescribe',
      center,
      'verifyPrompt',
      verifyPrompt,
      'retryCount',
      retryCount,
      'deepThink',
      deepThink,
    );

    const text = await this.insight.describe(center, { deepThink });
    debug('aiDescribe text', text);
    assert(text.description, `failed to describe element at [${center}]`);
    resultPrompt = text.description;

    // Fix: the verifyPrompt option was previously accepted but never
    // honored. When the caller opts out, return the first description
    // without the extra locate round-trip.
    if (!verifyPrompt) {
      success = true;
      break;
    }

    verifyResult = await this.verifyLocator(
      resultPrompt,
      deepThink ? { deepThink: true } : undefined,
      center,
      opt,
    );
    if (verifyResult.pass) {
      success = true;
    } else {
      retryCount++;
    }
  }

  return {
    prompt: resultPrompt,
    deepThink,
    verifyResult,
  };
}
/**
 * Re-locate an element by prompt and check it matches the expected
 * center: passes when the located center is within the distance
 * threshold, or when the expected point falls inside the located rect.
 */
async verifyLocator(
  prompt: string,
  locateOpt: LocateOption | undefined,
  expectCenter: [number, number],
  verifyLocateOption?: LocatorValidatorOption,
): Promise<LocateValidatorResult> {
  debug('verifyLocator', prompt, locateOpt, expectCenter, verifyLocateOption);

  const located = await this.aiLocate(prompt, locateOpt);
  const centerDistance = distanceOfTwoPoints(expectCenter, located.center);
  const threshold = verifyLocateOption?.centerDistanceThreshold || 20;
  const pass =
    centerDistance <= threshold || includedInRect(expectCenter, located.rect);

  const verifyResult = {
    pass,
    rect: located.rect,
    center: located.center,
    centerDistance,
  };
  debug('aiDescribe verifyResult', verifyResult);
  return verifyResult;
}
async aiLocate(prompt: string, opt?: LocateOption) {
const detailedLocateParam = this.buildDetailedLocateParam(prompt, opt);
const plans = buildPlans('Locate', detailedLocateParam);

View File

@ -217,6 +217,7 @@ export class PageTaskExecutor {
const shotTime = Date.now();
const pageContext = await this.insight.contextRetrieverFn('locate');
task.pageContext = pageContext;
const recordItem: ExecutionRecorderItem = {
type: 'screenshot',
ts: shotTime,

View File

@ -159,6 +159,32 @@ describe(
3 * 60 * 1000,
);
// E2E: locate the search bar, then describe the element at its center
// and expect the describe/verify round-trip to pass.
it('element describer', async () => {
const { originPage, reset } = await launchPage('https://www.taobao.com/');
resetFn = reset;
const agent = new PuppeteerAgent(originPage);
const { center } = await agent.aiLocate('the search bar');
const describeResult = await agent.describeElementAtPoint(center);
expect(describeResult.verifyResult?.pass).toBe(true);
expect(describeResult.verifyResult?.rect).toBeTruthy();
expect(describeResult.verifyResult?.center).toBeTruthy();
});
// E2E: same round-trip but with deepThink forced from the first attempt,
// targeting a smaller/harder element (the "search" button).
it('element describer - deep think', async () => {
const { originPage, reset } = await launchPage('https://www.taobao.com/');
resetFn = reset;
const agent = new PuppeteerAgent(originPage);
const { center } = await agent.aiLocate('the "search" button');
const describeResult = await agent.describeElementAtPoint(center, {
deepThink: true,
});
expect(describeResult.verifyResult?.pass).toBe(true);
expect(describeResult.verifyResult?.rect).toBeTruthy();
expect(describeResult.verifyResult?.center).toBeTruthy();
});
it('scroll', async () => {
const htmlPath = path.join(__dirname, 'scroll.html');
const { originPage, reset } = await launchPage(`file://${htmlPath}`);