feat(core): element describer (#748)

* feat(core): add element describer

* feat(core): add element describer

* chore(core): add test cases

* feat(core): put language settings into env

* fix(core): edge case for annotation

* chore(core): update describe settings

* chore(core): fix lint

* fix(core): remove unused cases

* feat(core): add describer widget

* feat(core): move describer to agent

* feat(core): update describer prompt

* feat(core): update describer prompt

* feat(core): add describer tool

* feat(core): add deepThink for describer

* fix(core): describer widget

* chore(core): fix lint

* docs(core): docs for MIDSCENE_PREFERRED_LANGUAGE

* feat(core): set context in locator dump
This commit is contained in:
yuyutaotao 2025-05-21 20:58:37 +08:00 committed by GitHub
parent 610e7979bf
commit 01b3576abd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
45 changed files with 925 additions and 332 deletions

View File

@ -1,8 +1,15 @@
import { PlayCircleOutlined } from '@ant-design/icons';
import type { UIContext } from '@midscene/core';
import { useStaticPageAgent } from '@midscene/visualizer';
import { Describer, useStaticPageAgent } from '@midscene/visualizer';
import type { WebUIContext } from '@midscene/web/utils';
import { Button, Drawer, Tooltip } from 'antd';
import {
Button,
ConfigProvider,
Drawer,
Tabs,
type TabsProps,
Tooltip,
} from 'antd';
import { useEffect, useState } from 'react';
import { StandardPlayground } from './playground';
import { useEnvConfig } from './store';
@ -26,6 +33,11 @@ const checkServerStatus = async () => {
}
};
const tabKeys = {
PLAYGROUND: 'playground',
ELEMENT_DESCRIBER: 'element-describer',
};
export const useServerValid = (shouldRun = true) => {
const [serverValid, setServerValid] = useState(false);
const { serviceMode } = useEnvConfig();
@ -79,6 +91,63 @@ export default function OpenInPlayground(props?: { context?: UIContext }) {
};
const agent = useStaticPageAgent(context as WebUIContext);
const tabItems: TabsProps['items'] = [
{
key: tabKeys.PLAYGROUND,
label: 'Playground',
},
...(location.href.indexOf('beta') >= 0
? [
{
key: tabKeys.ELEMENT_DESCRIBER,
label: 'Element Describer (Beta)',
},
]
: []),
];
const [activeTab, setActiveTab] = useState(tabKeys.PLAYGROUND);
let toolContent: React.ReactNode;
if (activeTab === tabKeys.PLAYGROUND) {
toolContent = (
<StandardPlayground
getAgent={() => {
return agent;
}}
dryMode={true}
hideLogo={true}
key={contextLoadingCounter}
/>
);
} else if (activeTab === tabKeys.ELEMENT_DESCRIBER) {
if (context) {
toolContent = (
<Describer uiContext={context} key={contextLoadingCounter} />
);
} else {
toolContent = <div>No context found</div>;
}
}
const tabComponent = (
<ConfigProvider
theme={{
components: {
Tabs: {
horizontalMargin: '0 0 -1px 10px',
},
},
}}
>
<Tabs
defaultActiveKey={activeTab}
items={tabItems}
onChange={setActiveTab}
/>
</ConfigProvider>
);
if (!ifPlaygroundValid) {
return (
<Tooltip
@ -109,25 +178,18 @@ export default function OpenInPlayground(props?: { context?: UIContext }) {
Open in Playground
</Button>
<Drawer
title="Playground"
title={tabComponent}
placement="right"
onClose={handleClose}
open={isDrawerVisible}
width="90%"
styles={{
header: { padding: '16px' },
header: { padding: '0 16px' },
body: { padding: '24px' },
}}
className="playground-drawer"
>
<StandardPlayground
getAgent={() => {
return agent;
}}
dryMode={true}
hideLogo={true}
key={contextLoadingCounter}
/>
{toolContent}
</Drawer>
</>
);

File diff suppressed because one or more lines are too long

View File

@ -46,6 +46,7 @@ Some advanced configs are also supported. Usually you don't need to use them.
| `OPENAI_USE_AZURE` | Optional. Set to "true" to use Azure OpenAI Service. See more details in the following section. |
| `MIDSCENE_OPENAI_INIT_CONFIG_JSON` | Optional. Custom JSON config for OpenAI SDK initialization |
| `MIDSCENE_OPENAI_SOCKS_PROXY` | Optional. Proxy configuration (e.g. "socks5://127.0.0.1:1080") |
| `MIDSCENE_PREFERRED_LANGUAGE` | Optional. The preferred language for the model response. The default is `Chinese` if the current timezone is GMT+8 and `English` otherwise. |
| `OPENAI_MAX_TOKENS` | Optional. Maximum tokens for model response |
### Debug configs

View File

@ -50,6 +50,7 @@ Midscene 默认集成了 OpenAI SDK 调用 AI 服务。使用这个 SDK 限定
| `OPENAI_USE_AZURE` | 可选。设置为 "true" 以使用 Azure OpenAI Service。更多详情请参阅后文 |
| `MIDSCENE_OPENAI_INIT_CONFIG_JSON` | 可选。OpenAI SDK 的初始化配置 JSON |
| `MIDSCENE_OPENAI_SOCKS_PROXY` | 可选。代理配置 (如 "socks5://127.0.0.1:1080") |
| `MIDSCENE_PREFERRED_LANGUAGE` | 可选。模型响应的语言。如果当前时区是 GMT+8 则默认是 `Chinese`,否则是 `English` |
| `OPENAI_MAX_TOKENS` | 可选。模型响应的 max_tokens 数 |
### 调试配置

View File

@ -39,6 +39,7 @@ export enum AIActionType {
INSPECT_ELEMENT = 1,
EXTRACT_DATA = 2,
PLAN = 3,
DESCRIBE_ELEMENT = 4,
}
export async function callAiFn<T>(

View File

@ -11,6 +11,7 @@ import type {
ElementById,
ElementTreeNode,
Rect,
ReferenceImage,
UIContext,
} from '@/types';
import {
@ -73,6 +74,7 @@ export async function AiLocateElement<
>(options: {
context: UIContext<ElementType>;
targetElementDescription: string;
referenceImage?: ReferenceImage;
callAI?: typeof callAiFn<AIElementResponse | [number, number]>;
searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;
}): Promise<{
@ -121,6 +123,15 @@ export async function AiLocateElement<
);
}
let referenceImagePayload: string | undefined;
if (options.referenceImage?.rect && options.referenceImage.base64) {
referenceImagePayload = await cropByRect(
options.referenceImage.base64,
options.referenceImage.rect,
getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),
);
}
const msgs: AIArgs = [
{ role: 'system', content: systemPrompt },
{

View File

@ -1,7 +1,8 @@
import { getPreferredLanguage } from '@midscene/shared/env';
import type { ResponseFormatJSONSchema } from 'openai/resources';
import { getTimeZoneInfo } from './ui-tars-planning';
export const language = getTimeZoneInfo().isChina ? 'Chinese' : 'English';
const preferredLanguage = getPreferredLanguage();
const defaultAssertionPrompt =
'You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.';
@ -21,7 +22,7 @@ const uiTarsAssertionResponseJsonFormat = `## Output Json String Format
## Rules **MUST** follow
- Make sure to return **only** the JSON, with **no additional** text or explanations.
- Use ${language} in \`thought\` part.
- Use ${preferredLanguage} in \`thought\` part.
- You **MUST** strictly follow up the **Output Json String Format**.`;
export function systemPromptToAssert(model: { isUITars: boolean }) {

View File

@ -0,0 +1,26 @@
import { getPreferredLanguage } from '@midscene/shared/env';
const preferredLanguage = getPreferredLanguage();
export const elementDescriberInstruction = () => {
return `Tell what is the content of the element wrapped by the read rectangle in the screenshot. Your description is expected to be used to precisely locate the element from other similar elements on screenshot. Use ${preferredLanguage} in the description.
Please follow the following rules:
1. The description should be start with a brief description, like "a button for confirming the action".
2. Include these information in the description to distinguish the element from its siblings and other similar elements, as much as possible:
- The text of the element, like "with text 'Confirm'"
- What the element looks like if it's an image, like "with image '...'"
- The relative position of the element, like "on the left of ..., around ..."
- How to distinguish the element from its siblings elements, like "it is the icon instead of the text"
3. Do NOT mention the red rectangle in the description.
4. Use the error field to describe the unexpected situations, if any. If not, put null.
Return in JSON:
{
"description": "[{brief description}]: {text of the element} {image of the element} {relative position of the element} ... ",
"error"?: "..."
}`;
};

View File

@ -1,19 +1,7 @@
export function getTimeZoneInfo(): { timezone: string; isChina: boolean } {
const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
const offset = -new Date().getTimezoneOffset() / 60;
return {
timezone: `UTC${offset >= 0 ? '+' : ''}${offset}`,
isChina: timeZone === 'Asia/Shanghai',
};
}
export function getLanguage(): string {
return getTimeZoneInfo().isChina ? 'Chinese' : 'English';
}
import { getPreferredLanguage } from '@midscene/shared/env';
export function getUiTarsPlanningPrompt(): string {
const language = getLanguage();
const preferredLanguage = getPreferredLanguage();
return `
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
@ -38,7 +26,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
## Note
- Use ${language} in \`Thought\` part.
- Use ${preferredLanguage} in \`Thought\` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
## User Instruction

View File

@ -253,13 +253,13 @@ export async function call(
let content: string | undefined;
let usage: OpenAI.CompletionUsage | undefined;
const commonConfig = {
temperature: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS) ? 0.0 : 0.1,
temperature: vlLocateMode() === 'vlm-ui-tars' ? 0.0 : 0.1,
stream: false,
max_tokens:
typeof maxTokens === 'number'
? maxTokens
: Number.parseInt(maxTokens || '2048', 10),
...(getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL) // qwen specific config
...(vlLocateMode() === 'qwen-vl' // qwen specific config
? {
vl_high_resolution_images: true,
}
@ -359,14 +359,13 @@ export async function callToGetJSONObject<T>(
case AIActionType.INSPECT_ELEMENT:
responseFormat = locatorSchema;
break;
case AIActionType.EXTRACT_DATA:
//TODO: Currently the restriction type can only be a json subset of the constraint, and the way the extract api is used needs to be adjusted to limit the user's data to this as well
// targetResponseFormat = extractDataSchema;
responseFormat = { type: AIResponseFormat.JSON };
break;
case AIActionType.PLAN:
responseFormat = planSchema;
break;
case AIActionType.EXTRACT_DATA:
case AIActionType.DESCRIBE_ELEMENT:
responseFormat = { type: AIResponseFormat.JSON };
break;
}
}

View File

@ -1,7 +1,18 @@
import { callAiFn } from '@/ai-model/common';
import { AiExtractElementInfo, AiLocateElement } from '@/ai-model/index';
import {
AIActionType,
type AIArgs,
callAiFn,
expandSearchArea,
} from '@/ai-model/common';
import {
AiExtractElementInfo,
AiLocateElement,
callToGetJSONObject,
} from '@/ai-model/index';
import { AiAssert, AiLocateSection } from '@/ai-model/inspect';
import { elementDescriberInstruction } from '@/ai-model/prompt/describe';
import type {
AIDescribeElementResponse,
AIElementResponse,
AISingleElementResponse,
AIUsageInfo,
@ -20,9 +31,11 @@ import type {
} from '@/types';
import {
MIDSCENE_FORCE_DEEP_THINK,
MIDSCENE_USE_QWEN_VL,
getAIConfigInBoolean,
vlLocateMode,
} from '@midscene/shared/env';
import { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';
import { getDebug } from '@midscene/shared/logger';
import { assert } from '@midscene/shared/utils';
import { emitInsightDump } from './utils';
@ -327,4 +340,74 @@ export default class Insight<
usage: assertResult.usage,
};
}
async describe(
target: Rect | [number, number],
opt?: {
deepThink?: boolean;
},
): Promise<Pick<AIDescribeElementResponse, 'description'>> {
assert(target, 'target is required for insight.describe');
const context = await this.contextRetrieverFn('describe');
const { screenshotBase64 } = context;
assert(screenshotBase64, 'screenshot is required for insight.describe');
const systemPrompt = elementDescriberInstruction();
// Convert [x,y] center point to Rect if needed
const defaultRectSize = 30;
const targetRect: Rect = Array.isArray(target)
? {
left: Math.floor(target[0] - defaultRectSize / 2),
top: Math.floor(target[1] - defaultRectSize / 2),
width: defaultRectSize,
height: defaultRectSize,
}
: target;
let imagePayload = await compositeElementInfoImg({
inputImgBase64: screenshotBase64,
elementsPositionInfo: [
{
rect: targetRect,
},
],
borderThickness: 3,
});
if (opt?.deepThink) {
const searchArea = expandSearchArea(targetRect, context.size);
debug('describe: set searchArea', searchArea);
imagePayload = await cropByRect(
imagePayload,
searchArea,
getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),
);
}
const msgs: AIArgs = [
{ role: 'system', content: systemPrompt },
{
role: 'user',
content: [
{
type: 'image_url',
image_url: {
url: imagePayload,
detail: 'high',
},
},
],
},
];
const callAIFn =
this.aiVendorFn || callToGetJSONObject<AIDescribeElementResponse>;
const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT);
const { content } = res;
assert(!content.error, `describe failed: ${content.error}`);
assert(content.description, 'failed to describe the element');
return content;
}
}

View File

@ -119,6 +119,28 @@ export interface AIAssertionResponse {
thought: string;
}
export interface AIDescribeElementResponse {
description: string;
error?: string;
}
export interface LocatorValidatorOption {
centerDistanceThreshold?: number;
}
export interface LocateValidatorResult {
pass: boolean;
rect: Rect;
center: [number, number];
centerDistance?: number;
}
export interface AgentDescribeElementAtPointResult {
prompt: string;
deepThink: boolean;
verifyResult?: LocateValidatorResult;
}
/**
* context
*/
@ -157,7 +179,7 @@ export interface InsightOptions {
export type EnsureObject<T> = { [K in keyof T]: any };
export type InsightAction = 'locate' | 'extract' | 'assert';
export type InsightAction = 'locate' | 'extract' | 'assert' | 'describe';
export type InsightExtractParam = string | Record<string, string>;

View File

@ -1,4 +1,4 @@
import type { PlanningActionParamScroll } from './types';
import type { PlanningActionParamScroll, Rect } from './types';
export interface LocateOption {
prompt?: string;
@ -6,8 +6,14 @@ export interface LocateOption {
cacheable?: boolean; // user can set this param to false to disable the cache for a single agent api
}
export interface ReferenceImage {
base64: string;
rect?: Rect;
}
export interface DetailedLocateParam extends LocateOption {
prompt: string;
referenceImage?: ReferenceImage;
}
export interface scrollParam {

View File

@ -25,7 +25,7 @@ describe.skipIf(!vlMode)('insight locate with deep think', () => {
await sleep(3000);
});
test('insight locate with search area and think twice', async () => {
test('insight locate with search area - deep think', async () => {
const { context } = await getContextFromFixture('taobao');
const insight = new Insight(context);
@ -66,3 +66,26 @@ test.skip('insight locate with search area', async () => {
console.log(element, rect);
await sleep(3000);
});
describe('insight describe', () => {
test('insight describe - by rect', async () => {
const { context } = await getContextFromFixture('taobao');
const insight = new Insight(context);
const { description } = await insight.describe({
left: 580,
top: 140,
width: 80,
height: 30,
});
expect(description).toBeDefined();
});
test('insight describe - by center point', async () => {
const { context } = await getContextFromFixture('taobao');
const insight = new Insight(context);
const { description } = await insight.describe([580, 140]);
expect(description).toBeDefined();
});
});

View File

@ -0,0 +1,24 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`elementDescriberInstruction > should return the correct instruction 1`] = `
"Tell what is the content of the element wrapped by the read rectangle in the screenshot. Your description is expected to be used to precisely locate the element from other similar elements on screenshot. Use English in the description.
Please follow the following rules:
1. The description should be start with a brief description, like "a button for confirming the action".
2. Include these information in the description to distinguish the element from its siblings and other similar elements, as much as possible:
- The text of the element, like "with text 'Confirm'"
- What the element looks like if it's an image, like "with image '...'"
- The relative position of the element, like "on the left of ..., around ..."
- How to distinguish the element from its siblings elements, like "it is the icon instead of the text"
3. Do NOT mention the red rectangle in the description.
4. Use the error field to describe the unexpected situations, if any. If not, put null.
Return in JSON:
{
"description": "[{brief description}]: {text of the element} {image of the element} {relative position of the element} ... ",
"error"?: "..."
}"
`;

View File

@ -2,18 +2,17 @@ import { systemPromptToAssert } from '@/ai-model/prompt/assertion';
import { describe, expect, it, vi } from 'vitest';
describe('Assertion prompt', () => {
vi.mock('@midscene/shared/env', () => ({
getPreferredLanguage: vi.fn().mockReturnValue('English'),
}));
it('return default when it is not UI-Tars', () => {
const prompt = systemPromptToAssert({ isUITars: false });
expect(prompt).toMatchSnapshot();
});
it('return UI-Tars specific when it is UI-Tars', () => {
vi.mock('@/ai-model/prompt/ui-tars-planning', () => ({
getTimeZoneInfo: vi.fn().mockReturnValue({ isChina: false }),
}));
const prompt = systemPromptToAssert({ isUITars: true });
expect(prompt).toMatchSnapshot();
});
});

View File

@ -0,0 +1,12 @@
import { elementDescriberInstruction } from '@/ai-model/prompt/describe';
import { describe, expect, it, vi } from 'vitest';
describe('elementDescriberInstruction', () => {
vi.mock('@midscene/shared/env', () => ({
getPreferredLanguage: vi.fn().mockReturnValue('English'),
}));
it('should return the correct instruction', () => {
expect(elementDescriberInstruction()).toMatchSnapshot();
});
});

View File

@ -1,32 +0,0 @@
import {
getLanguage,
getTimeZoneInfo,
} from '@/ai-model/prompt/ui-tars-planning';
import { afterEach, describe, expect, it } from 'vitest';
import { mockNonChinaTimeZone, restoreIntl } from '../mocks/intl-mock';
describe('UI TARS Planning Functions', () => {
afterEach(() => {
restoreIntl();
});
it('getTimeZoneInfo returns original timezone without mock', () => {
// This test will vary based on the system running it
const info = getTimeZoneInfo();
// We don't assert on specific values here as they depend on the local environment
expect(info).toHaveProperty('timezone');
expect(info).toHaveProperty('isChina');
expect(typeof info.timezone).toBe('string');
expect(typeof info.isChina).toBe('boolean');
});
it('getTimeZoneInfo returns non-China timezone with mock', () => {
mockNonChinaTimeZone();
const info = getTimeZoneInfo();
expect(info.isChina).toBe(false);
const language = getLanguage();
expect(language).toBe('English');
});
});

View File

@ -5,52 +5,49 @@
"prompt": "'最简单的用法'下方有五个 icon左侧第一个 icon",
"annotation_index_id": 1,
"response_rect": {
"left": 486,
"top": 859,
"width": 21,
"height": 14
"left": 538,
"top": 769,
"width": 12,
"height": 13
},
"response_element": {
"id": "nkpld",
"indexId": 104
"id": "klhmg"
}
},
{
"prompt": "'最简单的用法'下方有五个 icon左侧第二个 icon",
"annotation_index_id": 2,
"response_rect": {
"left": 519,
"top": 860,
"width": 15,
"height": 12
"left": 568,
"top": 739,
"width": 16,
"height": 16
},
"response_element": {
"id": "hdbbh",
"indexId": 105
"id": "mflch"
}
},
{
"prompt": "'最简单的用法'下方有五个 icon左侧第三个 icon",
"annotation_index_id": 3,
"response_rect": {
"left": 549,
"top": 860,
"width": 18,
"left": 538,
"top": 769,
"width": 12,
"height": 12
},
"response_element": {
"id": "ncono",
"indexId": 106
"id": "kbgij"
}
},
{
"prompt": "'最简单的用法'下方有五个 icon左侧第四个 icon",
"annotation_index_id": 4,
"response_rect": {
"left": 584,
"left": 583,
"top": 860,
"width": 13,
"height": 12
"width": 14,
"height": 14
},
"response_element": {
"id": "jkeam",
@ -61,70 +58,67 @@
"prompt": "'最简单的用法'下方有五个 icon最右侧的 icon",
"annotation_index_id": 5,
"response_rect": {
"left": 617,
"top": 862,
"width": 13,
"height": 9
"left": 587,
"top": 863,
"width": 14,
"height": 11
},
"response_element": {
"id": "nnkcf",
"indexId": 108
"id": "jkeam",
"indexId": 107
}
},
{
"prompt": "全屏幕右上角、版本号右侧有三个 icon ,查找左侧第一个",
"annotation_index_id": 6,
"response_rect": {
"left": 1269,
"top": 24,
"width": 20,
"height": 13
"left": 876,
"top": 35,
"width": 48,
"height": 26
},
"response_element": {
"id": "dinoj",
"indexId": 13
"id": "jmdcl"
}
},
{
"prompt": "全屏幕右上角有三个 icon ,查找左侧第二个",
"annotation_index_id": 7,
"response_rect": {
"left": 1309,
"top": 24,
"width": 26,
"height": 16
"left": 876,
"top": 35,
"width": 34,
"height": 7
},
"response_element": {
"id": "nfpha",
"indexId": 14
"id": "hdpjf"
}
},
{
"prompt": "屏幕右上角有三个 icon ,左侧第三个",
"annotation_index_id": 8,
"response_rect": {
"left": 1356,
"top": 24,
"width": 20,
"height": 16
"left": 867,
"top": 35,
"width": 59,
"height": 29
},
"response_element": {
"id": "hmbld",
"indexId": 15
"id": "jnobj"
}
},
{
"prompt": "在‘代码演示’右侧有三个 icon 按钮中,查找最中间的按钮",
"annotation_index_id": 9,
"response_rect": {
"left": 1184,
"top": 497,
"width": 16,
"height": 16
"left": 863,
"top": 574,
"width": 216,
"height": 33
},
"response_element": {
"id": "pkafb",
"indexId": 94
"id": "mjcce",
"indexId": 112
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 356 KiB

After

Width:  |  Height:  |  Size: 357 KiB

View File

@ -6,9 +6,9 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 721,
"top": 245,
"width": 72,
"left": 723,
"top": 246,
"width": 86,
"height": 15
},
"response_element": {
@ -21,10 +21,10 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 485,
"top": 246,
"width": 72,
"height": 15
"left": 486,
"top": 239,
"width": 84,
"height": 28
},
"response_element": {
"id": "aonmh",
@ -36,14 +36,13 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 492,
"top": 341,
"width": 294,
"height": 49
"left": 493,
"top": 367,
"width": 308,
"height": 25
},
"response_element": {
"id": "mfodf",
"indexId": 10
"id": "aelca"
}
},
{
@ -51,14 +50,13 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 492,
"top": 418,
"width": 294,
"height": 50
"left": 487,
"top": 365,
"width": 303,
"height": 47
},
"response_element": {
"id": "nhbof",
"indexId": 12
"id": "dahmc"
}
},
{
@ -67,9 +65,9 @@
"annotation_index_id": 5,
"response_rect": {
"left": 697,
"top": 435,
"width": 71,
"height": 14
"top": 438,
"width": 68,
"height": 13
},
"response_element": {
"id": "kdbdc",
@ -81,10 +79,10 @@
"multi": false,
"annotation_index_id": 6,
"response_rect": {
"left": 492,
"top": 558,
"width": 294,
"height": 45
"left": 603,
"top": 574,
"width": 86,
"height": 18
},
"response_element": {
"id": "bjnpl",
@ -96,10 +94,10 @@
"multi": false,
"annotation_index_id": 7,
"response_rect": {
"left": 845,
"top": 120,
"width": 15,
"height": 15
"left": 846,
"top": 123,
"width": 11,
"height": 16
},
"response_element": {
"id": "aigcl",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 191 KiB

After

Width:  |  Height:  |  Size: 192 KiB

View File

@ -5,9 +5,9 @@
"prompt": "左下角暂停按钮",
"annotation_index_id": 1,
"response_rect": {
"left": 17,
"left": 8,
"top": 769,
"width": 15,
"width": 27,
"height": 15
},
"response_element": {
@ -19,70 +19,65 @@
"prompt": "点赞(爱心)按钮",
"annotation_index_id": 2,
"response_rect": {
"left": 1204,
"top": 352,
"width": 23,
"height": 19
"left": 1348,
"top": 569,
"width": 24,
"height": 21
},
"response_element": {
"id": "ebgie",
"indexId": 23
"id": "bomgc"
}
},
{
"prompt": "评论按钮",
"annotation_index_id": 3,
"response_rect": {
"left": 1205,
"top": 426,
"width": 22,
"height": 17
"left": 1346,
"top": 578,
"width": 46,
"height": 42
},
"response_element": {
"id": "cjmim",
"indexId": 25
"id": "hiono"
}
},
{
"prompt": "书签收藏按钮",
"annotation_index_id": 4,
"response_rect": {
"left": 1203,
"top": 498,
"width": 27,
"height": 22
"left": -1,
"top": -3598647,
"width": 1285,
"height": 3599459
},
"response_element": {
"id": "moimk",
"indexId": 27
"id": "fkleb"
}
},
{
"prompt": "分享按钮",
"annotation_index_id": 5,
"response_rect": {
"left": 1203,
"top": 576,
"width": 25,
"height": 18
"left": -3568497,
"top": -3568497,
"width": 0,
"height": 0
},
"response_element": {
"id": "mgcne",
"indexId": 29
"id": "dcppg"
}
},
{
"prompt": "右下角区域声音按钮",
"annotation_index_id": 6,
"response_rect": {
"left": 1203,
"top": 769,
"width": 22,
"height": 15
"left": 1176,
"top": 768,
"width": 27,
"height": 26
},
"response_element": {
"id": "djknm",
"indexId": 10
"id": "eodda"
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 117 KiB

After

Width:  |  Height:  |  Size: 120 KiB

View File

@ -6,10 +6,10 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 17,
"top": 20,
"width": 22,
"height": 16
"left": 9,
"top": 18,
"width": 34,
"height": 39
},
"response_element": {
"id": "amjle",
@ -21,14 +21,13 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 58,
"top": 16,
"width": 66,
"height": 23
"left": 95,
"top": 34,
"width": 83,
"height": 6
},
"response_element": {
"id": "kfmhg",
"indexId": 1
"id": "mbdlb"
}
},
{
@ -36,10 +35,10 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 352,
"top": 19,
"width": 20,
"height": 17
"left": 354,
"top": 18,
"width": 18,
"height": 21
},
"response_element": {
"id": "podpa",
@ -51,14 +50,13 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 190,
"top": 724,
"width": 38,
"height": 19
"left": 348,
"top": 759,
"width": 14,
"height": 8
},
"response_element": {
"id": "dmggl",
"indexId": 20
"id": "eagmn"
}
},
{
@ -66,10 +64,10 @@
"multi": false,
"annotation_index_id": 5,
"response_rect": {
"left": 301,
"top": 864,
"width": 86,
"height": 18
"left": 304,
"top": 859,
"width": 83,
"height": 23
},
"response_element": {
"id": "cdmma",
@ -81,14 +79,13 @@
"multi": false,
"annotation_index_id": 6,
"response_rect": {
"left": 369,
"top": 825,
"width": 21,
"height": 16
"left": -348,
"top": -579,
"width": 87,
"height": 271
},
"response_element": {
"id": "ddeal",
"indexId": 27
"id": "dcppg"
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 373 KiB

After

Width:  |  Height:  |  Size: 377 KiB

View File

@ -6,14 +6,13 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 187,
"top": 276,
"width": 83,
"height": 19
"left": 89,
"top": 167,
"width": 361,
"height": 46
},
"response_element": {
"id": "eabha",
"indexId": 20
"id": "bkggi"
}
},
{
@ -21,10 +20,10 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 324,
"top": 801,
"width": 62,
"height": 16
"left": 348,
"top": 796,
"width": 37,
"height": 25
},
"response_element": {
"id": "kmdfd",
@ -36,14 +35,13 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 189,
"top": 802,
"width": 40,
"height": 15
"left": 94,
"top": 765,
"width": 294,
"height": 37
},
"response_element": {
"id": "eommc",
"indexId": 38
"id": "lmngi"
}
},
{
@ -51,10 +49,10 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 6,
"top": 382,
"width": 34,
"height": 13
"left": 4,
"top": 385,
"width": 63,
"height": 24
},
"response_element": {
"id": "lalae",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 289 KiB

After

Width:  |  Height:  |  Size: 292 KiB

View File

@ -6,13 +6,14 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 319,
"top": 54,
"width": 533,
"height": 40
"left": 346,
"top": 57,
"width": 546,
"height": 34
},
"response_element": {
"id": "aljah"
"id": "jfjah",
"indexId": 27
}
},
{
@ -20,10 +21,10 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 792,
"top": 65,
"width": 38,
"height": 16
"left": 803,
"top": 64,
"width": 22,
"height": 15
},
"response_element": {
"id": "ondpi",
@ -36,14 +37,14 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 118,
"top": 428,
"width": 53,
"height": 12
"left": 89,
"top": 430,
"width": 67,
"height": 15
},
"response_element": {
"id": "cjfcl",
"indexId": 99
"id": "hgioh",
"indexId": 98
}
},
{
@ -52,14 +53,13 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 1065,
"top": 385,
"width": 21,
"height": 19
"left": 1191,
"top": 389,
"width": 43,
"height": 34
},
"response_element": {
"id": "fkfdl",
"indexId": 190
"id": "hpjie"
}
},
{
@ -68,29 +68,28 @@
"multi": false,
"annotation_index_id": 5,
"response_rect": {
"left": 1251,
"top": 352,
"width": 22,
"height": 22
"left": 1158,
"top": 389,
"width": 12,
"height": 14
},
"response_element": {
"id": "iegkg",
"indexId": 212
"id": "knffl"
}
},
{
"prompt": "顶部工具栏的购物车 icon",
"deepThink": true,
"response_rect": {
"left": 837,
"top": 12,
"width": 17,
"height": 14
"left": 869,
"top": 9,
"width": 21,
"height": 16
},
"annotation_index_id": 6,
"response_element": {
"id": "aefln",
"indexId": 12
"id": "mlkcg",
"indexId": 13
}
}
]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.0 MiB

After

Width:  |  Height:  |  Size: 1.0 MiB

View File

@ -6,14 +6,13 @@
"multi": false,
"annotation_index_id": 1,
"response_rect": {
"left": 512,
"top": 127,
"width": 556,
"height": 71
"left": 548,
"top": 179,
"width": 55,
"height": 21
},
"response_element": {
"id": "okgbn",
"indexId": 18
"id": "lbjjf"
}
},
{
@ -21,14 +20,13 @@
"multi": false,
"annotation_index_id": 2,
"response_rect": {
"left": 512,
"top": 127,
"width": 556,
"height": 71
"left": -498,
"top": -576,
"width": 1701,
"height": 1304
},
"response_element": {
"id": "okgbn",
"indexId": 18
"id": "ffncc"
}
},
{
@ -36,14 +34,13 @@
"multi": false,
"annotation_index_id": 3,
"response_rect": {
"left": 574,
"top": 276,
"width": 117,
"height": 17
"left": 587,
"top": 274,
"width": 376,
"height": 24
},
"response_element": {
"id": "idmhb",
"indexId": 24
"id": "micif"
}
},
{
@ -51,10 +48,10 @@
"multi": false,
"annotation_index_id": 4,
"response_rect": {
"left": 1028,
"top": 279,
"width": 15,
"height": 12
"left": 987,
"top": 254,
"width": 76,
"height": 46
},
"response_element": {
"id": "jicbk",
@ -66,10 +63,10 @@
"multi": false,
"annotation_index_id": 5,
"response_rect": {
"left": 521,
"top": 334,
"width": 33,
"height": 26
"left": 527,
"top": 340,
"width": 21,
"height": 21
},
"response_element": {
"id": "kjccf",
@ -81,10 +78,10 @@
"multi": false,
"annotation_index_id": 6,
"response_rect": {
"left": 802,
"left": 804,
"top": 391,
"width": 69,
"height": 12
"width": 68,
"height": 14
},
"response_element": {
"id": "ddapc",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 305 KiB

After

Width:  |  Height:  |  Size: 309 KiB

View File

@ -85,11 +85,6 @@ testSources.forEach((source) => {
indexId,
rect,
});
// // biome-ignore lint/performance/noDelete: <explanation>
// delete (testCase as any).response_bbox;
// // biome-ignore lint/performance/noDelete: <explanation>
// delete (testCase as any).response;
}
if (element) {

View File

@ -31,6 +31,8 @@ export const MATCH_BY_POSITION = 'MATCH_BY_POSITION';
export const MIDSCENE_API_TYPE = 'MIDSCENE-API-TYPE';
export const MIDSCENE_REPORT_TAG_NAME = 'MIDSCENE_REPORT_TAG_NAME';
export const MIDSCENE_PREFERRED_LANGUAGE = 'MIDSCENE_PREFERRED_LANGUAGE';
export const MIDSCENE_USE_AZURE_OPENAI = 'MIDSCENE_USE_AZURE_OPENAI';
export const MIDSCENE_AZURE_OPENAI_SCOPE = 'MIDSCENE_AZURE_OPENAI_SCOPE';
export const MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON =
@ -103,6 +105,8 @@ export const allConfigFromEnv = () => {
[MIDSCENE_MCP_USE_PUPPETEER_MODE]:
process.env[MIDSCENE_MCP_USE_PUPPETEER_MODE] || undefined,
[MIDSCENE_RUN_DIR]: process.env[MIDSCENE_RUN_DIR] || undefined,
[MIDSCENE_PREFERRED_LANGUAGE]:
process.env[MIDSCENE_PREFERRED_LANGUAGE] || undefined,
};
};
@ -244,3 +248,13 @@ export const overrideAIConfig = (
? { ...currentConfig, ...newConfig }
: { ...newConfig };
};
export const getPreferredLanguage = () => {
if (getAIConfig(MIDSCENE_PREFERRED_LANGUAGE)) {
return getAIConfig(MIDSCENE_PREFERRED_LANGUAGE);
}
const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
const isChina = timeZone === 'Asia/Shanghai';
return isChina ? 'Chinese' : 'English';
};

View File

@ -1,6 +1,6 @@
import assert from 'node:assert';
import type Jimp from 'jimp';
import type { BaseElement } from '../types';
import type { BaseElement, Rect } from '../types';
import getJimp from './get-jimp';
import { bufferFromBase64, imageInfoOfBase64 } from './index';
@ -21,11 +21,17 @@ const loadFonts = async () => {
}
};
interface ElementForOverlay {
rect: Rect;
indexId?: number;
}
const createSvgOverlay = async (
elements: Array<BaseElement>,
elements: Array<ElementForOverlay>,
imageWidth: number,
imageHeight: number,
boxPadding = 5,
borderThickness = 2,
prompt?: string,
): Promise<Jimp> => {
const Jimp = await getJimp();
@ -86,17 +92,21 @@ const createSvgOverlay = async (
const color = colors[index % colors.length];
// Add 5px padding to the rect
const paddedLeft = Math.max(0, element.rect.left - boxPadding);
const paddedTop = Math.max(0, element.rect.top - boxPadding);
const paddedWidth = Math.min(
imageWidth - paddedLeft,
element.rect.width + boxPadding * 2,
);
const paddedHeight = Math.min(
imageHeight - paddedTop,
element.rect.height + boxPadding * 2,
);
const paddedRect = {
left: Math.max(0, element.rect.left - boxPadding),
top: Math.max(0, element.rect.top - boxPadding),
width: Math.min(
imageWidth - element.rect.left,
element.rect.width + boxPadding * 2,
),
height: Math.min(
imageHeight - element.rect.top,
element.rect.height + boxPadding * 2,
),
left: paddedLeft,
top: paddedTop,
width: paddedWidth,
height: paddedHeight,
};
// Draw rectangle
@ -107,10 +117,12 @@ const createSvgOverlay = async (
paddedRect.height,
(x: number, y: number, idx: number): void => {
if (
x === paddedRect.left ||
x === paddedRect.left + paddedRect.width - 1 ||
y === paddedRect.top ||
y === paddedRect.top + paddedRect.height - 1
(x >= paddedRect.left && x < paddedRect.left + borderThickness) || // Left border
(x <= paddedRect.left + paddedRect.width - 1 &&
x > paddedRect.left + paddedRect.width - borderThickness) || // Right border
(y >= paddedRect.top && y < paddedRect.top + borderThickness) || // Top border
(y <= paddedRect.top + paddedRect.height - 1 &&
y > paddedRect.top + paddedRect.height - borderThickness) // Bottom border
) {
image.bitmap.data[idx + 0] = (color.rect >> 24) & 0xff; // R
image.bitmap.data[idx + 1] = (color.rect >> 16) & 0xff; // G
@ -234,9 +246,10 @@ const createSvgOverlay = async (
export const compositeElementInfoImg = async (options: {
inputImgBase64: string;
elementsPositionInfo: Array<BaseElement>;
elementsPositionInfo: Array<ElementForOverlay>;
size?: { width: number; height: number };
annotationPadding?: number;
borderThickness?: number;
prompt?: string;
}) => {
assert(options.inputImgBase64, 'inputImgBase64 is required');
@ -280,6 +293,7 @@ export const compositeElementInfoImg = async (options: {
width,
height,
options.annotationPadding,
options.borderThickness,
prompt,
);
const svgImage = await Jimp.read(svgOverlay);

View File

@ -12,10 +12,26 @@ import { useBlackboardPreference } from './store/store';
const itemFillAlpha = 0.4;
const highlightAlpha = 0.4;
const pointRadius = 10;
// Shared no-op callback, used as a safe default for optional handlers.
const noop = () => {
// noop
};
/**
 * Build a PIXI graphics object that marks a single screen point with a
 * filled circle in the element highlight color.
 */
export const pointMarkForItem = (
  point: [number, number],
  type: 'highlightPoint',
) => {
  const circle = new PIXI.Graphics();
  const fillColor = highlightColorForType('element');
  // render a translucent filled circle centered on the point
  circle.beginFill(fillColor, itemFillAlpha);
  circle.drawCircle(point[0], point[1], pointRadius);
  circle.endFill();
  return circle;
};
export const rectMarkForItem = (
rect: Rect,
name: string,
@ -49,6 +65,9 @@ export const rectMarkForItem = (
graphics.filters = [dropShadowFilter];
const nameFontSize = 18;
if (!name) {
return [graphics];
}
const texts = new PIXI.Text(name, {
fontSize: nameFontSize,
fill: 0x0,
@ -62,11 +81,14 @@ export const Blackboard = (props: {
uiContext: UIContext;
highlightElements?: BaseElement[];
highlightRect?: Rect;
highlightPoints?: [number, number][];
hideController?: boolean;
onCanvasClick?: (position: [number, number]) => void;
}): JSX.Element => {
const highlightElements: BaseElement[] = props.highlightElements || [];
const highlightIds = highlightElements.map((e) => e.id);
const highlightRect = props.highlightRect;
const highlightPoints = props.highlightPoints;
const context = props.uiContext!;
const { size, screenshotBase64 } = context;
@ -128,6 +150,28 @@ export const Blackboard = (props: {
};
}, [app, screenWidth, screenHeight]);
// Wire up canvas click handling once the PIXI app is initialized.
useEffect(() => {
  if (!appInitialed) {
    return;
  }
  // Enable interaction on the stage and all its children
  app.stage.eventMode = 'static';
  app.stage.hitArea = new PIXI.Rectangle(0, 0, screenWidth, screenHeight);

  const clickHandler = (event: PIXI.FederatedPointerEvent) => {
    const { x, y } = event.data.global;
    props.onCanvasClick?.([Math.round(x), Math.round(y)]);
  };
  app.stage.on('click', clickHandler);

  return () => {
    // Remove only our handler: calling off('click') with no listener
    // argument would detach every click listener on the stage,
    // including ones registered elsewhere.
    app?.stage?.off('click', clickHandler);
  };
}, [appInitialed, props.onCanvasClick, screenWidth, screenHeight]);
// draw all texts on PIXI app
useEffect(() => {
if (!appInitialed) {
@ -144,13 +188,18 @@ export const Blackboard = (props: {
backgroundSprite.y = 0;
backgroundSprite.width = screenWidth;
backgroundSprite.height = screenHeight;
// Ensure the background doesn't block interactivity
backgroundSprite.eventMode = 'passive';
app.stage.addChildAt(backgroundSprite, 0);
pixiBgRef.current = backgroundSprite;
};
img.onerror = (e) => {
console.error('load screenshot failed', e);
};
img.src = screenshotBase64;
}, [app.stage, appInitialed]);
}, [app.stage, appInitialed, screenWidth, screenHeight]);
const { highlightElementRects } = useMemo(() => {
const highlightElementRects: Rect[] = [];
@ -158,8 +207,11 @@ export const Blackboard = (props: {
highlightContainer.removeChildren();
elementMarkContainer.removeChildren();
// Make containers interactive but not blocking events
highlightContainer.eventMode = 'passive';
elementMarkContainer.eventMode = 'passive';
if (highlightRect) {
console.log('highlightRect', highlightRect);
const [graphics] = rectMarkForItem(
highlightRect,
'Search Area',
@ -176,6 +228,13 @@ export const Blackboard = (props: {
});
}
if (highlightPoints?.length) {
highlightPoints.forEach((point) => {
const graphics = pointMarkForItem(point, 'highlightPoint');
highlightContainer.addChild(graphics);
});
}
// element rects
context.content.forEach((element) => {
const { rect, content, id } = element;
@ -200,6 +259,7 @@ export const Blackboard = (props: {
context.content,
hoverElement,
highlightRect,
highlightPoints,
// bgVisible,
// elementsVisible,
]);

View File

@ -0,0 +1,24 @@
// Styles for the element-describer widget: the screenshot acts as the
// positioning context and a status text bar is pinned to its bottom edge.
.image-describer {
position: relative;
// status / result bar overlaid on the bottom of the screenshot
.describe-text {
box-sizing: border-box;
position: absolute;
background: #000;
width: 100%;
height: 30px;
left: 0;
bottom: 0;
color: #FFF;
font-size: 12px;
padding: 10px;
}
// green bar when the describe round-trip verified successfully
.describe-text.success {
background: #047704;
}
// red bar when describing or verification failed
.describe-text.error {
background: #870707;
}
}

View File

@ -0,0 +1,150 @@
'use client';
import type {
AgentDescribeElementAtPointResult,
Rect,
UIContext,
} from '@midscene/core';
import type { WebUIContext } from '@midscene/web/utils';
import { useEffect, useRef, useState } from 'react';
import { useStaticPageAgent } from './playground/useStaticPageAgent';
import './describer.less';
import { Panel, PanelGroup, PanelResizeHandle } from 'react-resizable-panels';
import { Blackboard } from './blackboard';
import { PlaygroundResultView } from './playground/PlaygroundResult';
/**
 * Element-describer widget: renders the page screenshot, lets the user
 * click a point on it, asks the agent to describe the element at that
 * point, and shows the resulting prompt (or error) alongside highlight
 * markers for the clicked point and the verified element rect.
 */
export const Describer = (props: { uiContext: UIContext }): JSX.Element => {
const { uiContext } = props;
const image = uiContext.screenshotBase64;
const canvasRef = useRef<HTMLCanvasElement>(null);
// points the user has clicked (drawn as circles on the blackboard)
const [highlightPoints, setHighlightPoints] = useState<[number, number][]>(
[],
);
// rect of the element re-located during verification, if any
const [highlightRect, setHighlightRect] = useState<Rect | undefined>();
const [error, setError] = useState<string | undefined>();
const [loading, setLoading] = useState(false);
const [result, setResult] = useState<
AgentDescribeElementAtPointResult | undefined
>();
const agent = useStaticPageAgent(uiContext as WebUIContext);

// Paint the screenshot onto the (offscreen) canvas whenever it changes.
useEffect(() => {
const canvas = canvasRef.current;
if (!canvas || !image) return;

const ctx = canvas.getContext('2d');
if (!ctx) return;

const img = new Image();
img.onload = () => {
// Set canvas dimensions to match the image
canvas.width = img.width;
canvas.height = img.height;
// Draw the image on the canvas
ctx.drawImage(img, 0, 0);
};
// Set the image source (base64 data)
img.src = image;
}, [image]);

// Handle a click on the blackboard: reset state, mark the point, then
// ask the agent to describe (and verify) the element at that point.
const handleClick = async (position: [number, number]) => {
if (!agent) {
console.error('agent is not initialized');
return;
}
setLoading(true);
setError(undefined);
setResult(undefined);
setHighlightPoints([]);
setHighlightRect(undefined);
try {
const userLocation: [number, number] = [position[0], position[1]];
setHighlightPoints([userLocation]);
const result = await agent?.describeElementAtPoint(userLocation);
console.log('describe result', result);
setResult(result);
if (result.verifyResult?.rect) {
setHighlightRect(result.verifyResult.rect);
}
} catch (error: any) {
setError(error.message);
} finally {
setLoading(false);
}
};

// Derive the text shown in the result panel from error/result/loading.
let resultText = '';
if (error) {
resultText = error;
} else if (result && !result.verifyResult?.pass) {
resultText = `Locate failed with prompt: ${result.prompt}`;
} else if (result) {
if (result.deepThink) {
resultText = `Deep think: ${result.prompt}`;
} else {
resultText = result.prompt;
}
} else if (loading) {
resultText = 'Loading...';
}

return (
<div className="image-describer">
<PanelGroup autoSaveId="describer-layout" direction="horizontal">
<Panel
defaultSize={32}
maxSize={60}
minSize={20}
style={{ paddingRight: '24px' }}
>
<div className="form-part context-panel">
<h3>Screenshot</h3>
<div className="form-sub-title">
Click on the screenshot, Midscene will help you describe the
element at the clicked point.
</div>
<Blackboard
uiContext={{
...uiContext,
content: [], // remove all contents
tree: {
node: null,
children: [],
},
}}
highlightPoints={highlightPoints}
highlightRect={highlightRect}
onCanvasClick={handleClick}
hideController={true}
/>
</div>
</Panel>
<PanelResizeHandle className="panel-resize-handle" />
<Panel>
<PlaygroundResultView
result={{
result: resultText,
error: error || null,
}}
loading={loading}
serverValid={true}
serviceMode={'In-Browser'}
replayScriptsInfo={null}
replayCounter={0}
loadingProgressText={''}
/>
</Panel>
</PanelGroup>
</div>
);
};

export default Describer;

View File

@ -6,7 +6,8 @@ body {
font-size: 14px;
}
.playground-container {
.playground-container,
.image-describer {
width: 100%;
height: 100%;
@ -35,7 +36,6 @@ body {
overflow-y: auto !important;
.ant-form {
flex-grow: 1;
display: flex;
flex-direction: column;
@ -50,6 +50,11 @@ body {
font-size: 18px;
}
.form-sub-title {
margin-bottom: 12px;
font-size: 14px;
}
.switch-btn-wrapper {
.ant-btn {
padding: 0;

View File

@ -6,8 +6,8 @@ import type { WebUIContext } from '@midscene/web/utils';
// result type
export interface PlaygroundResult {
result: any;
dump: GroupedActionDump | null;
reportHTML: string | null;
dump?: GroupedActionDump | null;
reportHTML?: string | null;
error: string | null;
}

View File

@ -28,6 +28,7 @@ export { PromptInput } from './component/playground/PromptInput';
export { Player } from './component/player';
export { Blackboard } from './component/blackboard';
export { GithubStar } from './component/github-star';
export { Describer } from './component/describer';
// Export playground utilities
export {

View File

@ -144,7 +144,6 @@ export class ExtensionBridgePageBrowserSide extends ChromeExtensionProxyPage {
},
) {
const tabs = await chrome.tabs.query({ active: true, currentWindow: true });
console.log('current tab', tabs);
const tabId = tabs[0]?.id;
assert(tabId, 'failed to get tabId');

View File

@ -1,6 +1,7 @@
import type { WebPage } from '@/common/page';
import {
type AgentAssertOpt,
type AgentDescribeElementAtPointResult,
type AgentWaitForOpt,
type DetailedLocateParam,
type ExecutionDump,
@ -11,9 +12,12 @@ import {
type InsightAction,
type LocateOption,
type LocateResultElement,
type LocateValidatorResult,
type LocatorValidatorOption,
type MidsceneYamlScript,
type OnTaskStartTip,
type PlanningActionParamScroll,
type Rect,
} from '@midscene/core';
import yaml from 'js-yaml';
@ -49,6 +53,18 @@ import { type WebUIContext, parseContextFromWebPage } from './utils';
const debug = getDebug('web-integration');
// Euclidean distance between two screen points, rounded to whole pixels.
const distanceOfTwoPoints = (p1: [number, number], p2: [number, number]) => {
  const dx = p1[0] - p2[0];
  const dy = p1[1] - p2[1];
  return Math.round(Math.hypot(dx, dy));
};
// Whether a point lies inside (or exactly on the edge of) the rect.
const includedInRect = (point: [number, number], rect: Rect) => {
  const withinX = point[0] >= rect.left && point[0] <= rect.left + rect.width;
  const withinY = point[1] >= rect.top && point[1] <= rect.top + rect.height;
  return withinX && withinY;
};
export interface PageAgentOpt {
forceSameTabNavigation?: boolean /* if limit the new tab to the current page, default true */;
testId?: string;
@ -404,6 +420,88 @@ export class PageAgent<PageType extends WebPage = WebPage> {
return output;
}
/**
 * Describe the element at a screen point with a natural-language prompt,
 * then (unless disabled) verify the prompt by re-locating the element
 * and checking the result matches the original point.
 *
 * Retries up to `retryLimit` times when verification fails; from the
 * third attempt on, deepThink is forced to help with ambiguous targets.
 *
 * @param center screen coordinates of the target element's center
 * @param opt verifyPrompt (default true), retryLimit (default 3),
 *            deepThink, plus locator-validation thresholds
 * @returns the generated prompt, whether deepThink was used, and the
 *          verification result (undefined when verification is skipped)
 */
async describeElementAtPoint(
  center: [number, number],
  opt?: {
    verifyPrompt?: boolean;
    retryLimit?: number;
    deepThink?: boolean;
  } & LocatorValidatorOption,
): Promise<AgentDescribeElementAtPointResult> {
  const { verifyPrompt = true, retryLimit = 3 } = opt || {};

  let success = false;
  let retryCount = 0;
  let resultPrompt = '';
  let deepThink = opt?.deepThink || false;
  let verifyResult: LocateValidatorResult | undefined;

  while (!success && retryCount < retryLimit) {
    // escalate to deepThink after two failed attempts
    if (retryCount >= 2) {
      deepThink = true;
    }
    debug(
      'aiDescribe',
      center,
      'verifyPrompt',
      verifyPrompt,
      'retryCount',
      retryCount,
      'deepThink',
      deepThink,
    );

    const text = await this.insight.describe(center, { deepThink });
    debug('aiDescribe text', text);
    assert(text.description, `failed to describe element at [${center}]`);
    resultPrompt = text.description;

    // Fix: the verifyPrompt option was previously accepted but never
    // honored. When the caller opts out, return the first description
    // without the extra locate round-trip.
    if (!verifyPrompt) {
      success = true;
      break;
    }

    verifyResult = await this.verifyLocator(
      resultPrompt,
      deepThink ? { deepThink: true } : undefined,
      center,
      opt,
    );
    if (verifyResult.pass) {
      success = true;
    } else {
      retryCount++;
    }
  }

  return {
    prompt: resultPrompt,
    deepThink,
    verifyResult,
  };
}
/**
 * Re-locate an element by prompt and check it matches the expected
 * center: passes when the located center is within the distance
 * threshold, or when the expected point falls inside the located rect.
 */
async verifyLocator(
  prompt: string,
  locateOpt: LocateOption | undefined,
  expectCenter: [number, number],
  verifyLocateOption?: LocatorValidatorOption,
): Promise<LocateValidatorResult> {
  debug('verifyLocator', prompt, locateOpt, expectCenter, verifyLocateOption);

  const located = await this.aiLocate(prompt, locateOpt);
  const centerDistance = distanceOfTwoPoints(expectCenter, located.center);
  const threshold = verifyLocateOption?.centerDistanceThreshold || 20;
  const pass =
    centerDistance <= threshold || includedInRect(expectCenter, located.rect);

  const verifyResult = {
    pass,
    rect: located.rect,
    center: located.center,
    centerDistance,
  };
  debug('aiDescribe verifyResult', verifyResult);
  return verifyResult;
}
async aiLocate(prompt: string, opt?: LocateOption) {
const detailedLocateParam = this.buildDetailedLocateParam(prompt, opt);
const plans = buildPlans('Locate', detailedLocateParam);

View File

@ -217,6 +217,7 @@ export class PageTaskExecutor {
const shotTime = Date.now();
const pageContext = await this.insight.contextRetrieverFn('locate');
task.pageContext = pageContext;
const recordItem: ExecutionRecorderItem = {
type: 'screenshot',
ts: shotTime,

View File

@ -159,6 +159,32 @@ describe(
3 * 60 * 1000,
);
// E2E: locate the search bar, then describe the element at its center
// and expect the describe/verify round-trip to pass.
it('element describer', async () => {
const { originPage, reset } = await launchPage('https://www.taobao.com/');
resetFn = reset;
const agent = new PuppeteerAgent(originPage);
const { center } = await agent.aiLocate('the search bar');
const describeResult = await agent.describeElementAtPoint(center);
expect(describeResult.verifyResult?.pass).toBe(true);
expect(describeResult.verifyResult?.rect).toBeTruthy();
expect(describeResult.verifyResult?.center).toBeTruthy();
});
// E2E: same round-trip but with deepThink forced from the first attempt,
// targeting a smaller/harder element (the "search" button).
it('element describer - deep think', async () => {
const { originPage, reset } = await launchPage('https://www.taobao.com/');
resetFn = reset;
const agent = new PuppeteerAgent(originPage);
const { center } = await agent.aiLocate('the "search" button');
const describeResult = await agent.describeElementAtPoint(center, {
deepThink: true,
});
expect(describeResult.verifyResult?.pass).toBe(true);
expect(describeResult.verifyResult?.rect).toBeTruthy();
expect(describeResult.verifyResult?.center).toBeTruthy();
});
it('scroll', async () => {
const htmlPath = path.join(__dirname, 'scroll.html');
const { originPage, reset } = await launchPage(`file://${htmlPath}`);