feat: support android device button (#567)

2025-12-27 15:10:20 +00:00 · 2025-04-22 23:55:42 +08:00 · 2025-04-22 23:55:42 +08:00 · 38c50a23c3
commit 38c50a23c3
parent 5c6e3e3b66
14 changed files with 336 additions and 59 deletions
--- a/packages/android/src/page/index.ts
+++ b/packages/android/src/page/index.ts
@ -2,6 +2,7 @@ import assert from 'node:assert';
 import fs from 'node:fs';
 import path from 'node:path';
 import type { Point, Size } from '@midscene/core';
+import type { PageType } from '@midscene/core';
 import { getTmpFile } from '@midscene/core/utils';
 import type { ElementInfo } from '@midscene/shared/extractor';
 import { isValidPNGImageBuffer, resizeImg } from '@midscene/shared/img';
@ -19,7 +20,7 @@ export class AndroidDevice implements AndroidDevicePage {
  private deviceRatio = 1;
  private adb: ADB | null = null;
  private connectingAdb: Promise<ADB> | null = null;
-  pageType = 'android';
+  pageType: PageType = 'android';
  uri: string | undefined;

  constructor(deviceId: string) {
@ -631,4 +632,19 @@ ${Object.keys(size)
      console.error('Error during cleanup:', error);
    }
  }
+
+  /** Triggers the Android system "back" operation by sending key event 4 through `adb shell`. */
+  async back(): Promise<void> {
+    const device = await this.getAdb();
+    const backKeyCommand = 'input keyevent 4';
+    await device.shell(backKeyCommand);
+  }
+
+  /** Triggers the Android system "home" operation by sending key event 3 through `adb shell`. */
+  async home(): Promise<void> {
+    const device = await this.getAdb();
+    const homeKeyCommand = 'input keyevent 3';
+    await device.shell(homeKeyCommand);
+  }
+
+  /**
+   * Triggers the Android system "recent apps" operation by sending
+   * KEYCODE_APP_SWITCH (187) through `adb shell input keyevent`.
+   *
+   * Key code 82 is KEYCODE_MENU, which opens an app's options menu rather
+   * than the recent-apps overview, so it must not be used here.
+   */
+  async recentApps(): Promise<void> {
+    const adb = await this.getAdb();
+    await adb.shell('input keyevent 187');
+  }
 }
--- a/packages/android/tests/ai/travel.test.ts
+++ b/packages/android/tests/ai/travel.test.ts
@ -0,0 +1,42 @@
+import { sleep } from '@midscene/core/utils';
+import { beforeAll, describe, expect, it, vi } from 'vitest';
+import { AndroidAgent, AndroidDevice, getConnectedDevices } from '../../src';
+
+vi.setConfig({
+  testTimeout: 240 * 1000,
+});
+
+// End-to-end scenario: look up the weather, use the Android system "Home"
+// button to return to the home screen, then plan a route in Maps.
+describe('Test travel', () => {
+  let agent: AndroidAgent;
+
+  beforeAll(async () => {
+    const devices = await getConnectedDevices();
+    // Fail with a readable message instead of a TypeError on `devices[0].udid`
+    // when no device is attached.
+    const firstDevice = devices[0];
+    if (!firstDevice) {
+      throw new Error('No connected Android device found for the travel test');
+    }
+    const page = new AndroidDevice(firstDevice.udid);
+    agent = new AndroidAgent(page, {
+      aiActionContext:
+        'If any location, permission, user agreement, etc. popup, click agree. If login page pops up, close it.',
+    });
+    await page.connect();
+  });
+
+  it(
+    'travel',
+    async () => {
+      await agent.aiAction('open Weather app');
+      await agent.aiAction(
+        'click plus create button on the left top corner, enter search page, search "Hangzhou"',
+      );
+      await agent.aiAction(
+        'if there is one day without rain on screen, click Android System Button "Home" to return to Home Screen',
+      );
+      await agent.aiAction(
+        'open Maps app, search "West Lake", click the search button',
+      );
+      await agent.aiAction(
+        'click "Directions" button, enter the route planning page',
+      );
+      await agent.aiAction('click "Start" button to start navigation');
+    },
+    // This scenario chains several AI actions, so it gets a longer per-test
+    // timeout than the file-wide default set above.
+    720 * 1000,
+  );
+});
--- a/packages/core/package.json
+++ b/packages/core/package.json
@ -30,7 +30,8 @@
    "build:watch": "modern build -w",
    "new": "modern new",
    "upgrade": "modern upgrade",
-    "test": "vitest --run -u",
+    "test": "vitest --run",
+    "test:u": "vitest --run -u",
    "test:ai": "AITEST=true npm run test",
    "computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
    "test:parse-action": "npm run test:ai -- tests/ai/parse-action.test.ts",
--- a/packages/core/src/ai-model/llm-planning.ts
+++ b/packages/core/src/ai-model/llm-planning.ts
@ -1,5 +1,5 @@
 import { vlLocateMode } from '@/env';
-import type { PlanningAIResponse, UIContext } from '@/types';
+import type { PageType, PlanningAIResponse, UIContext } from '@/types';
 import { paddingToMatchBlockByBase64 } from '@midscene/shared/img';
 import { assert } from '@midscene/shared/utils';
 import {
@ -19,23 +19,29 @@ import { describeUserPage } from './prompt/util';
 export async function plan(
  userInstruction: string,
  opts: {
+    context: UIContext;
+    pageType: PageType;
+    callAI?: typeof callAiFn<PlanningAIResponse>;
    log?: string;
    actionContext?: string;
-    context: UIContext;
-    callAI?: typeof callAiFn<PlanningAIResponse>;
  },
 ): Promise<PlanningAIResponse> {
  const { callAI, context } = opts || {};
  const { screenshotBase64, screenshotBase64WithElementMarker, size } = context;
  const { description: pageDescription } = await describeUserPage(context);

-  const systemPrompt = await systemPromptToTaskPlanning(vlLocateMode());
+  const systemPrompt = await systemPromptToTaskPlanning({
+    pageType: opts.pageType,
+    vlMode: vlLocateMode(),
+  });
  const taskBackgroundContextText = generateTaskBackgroundContext(
    userInstruction,
    opts.log,
    opts.actionContext,
  );
-  const userInstructionPrompt = await automationUserPrompt().format({
+  const userInstructionPrompt = await automationUserPrompt(
+    vlLocateMode(),
+  ).format({
    pageDescription,
    taskBackgroundContext: taskBackgroundContextText,
  });
--- a/packages/core/src/ai-model/prompt/llm-planning.ts
+++ b/packages/core/src/ai-model/prompt/llm-planning.ts
@ -1,9 +1,9 @@
-import { vlLocateMode } from '@/env';
+import type { vlLocateMode } from '@/env';
+import type { PageType } from '@/types';
 import { PromptTemplate } from '@langchain/core/prompts';
 import type { ResponseFormatJSONSchema } from 'openai/resources';
 import { bboxDescription } from './common';
 import { samplePageDescription } from './util';
-
 // Note: put the log field first to trigger the CoT
 const vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
 const vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
@ -14,14 +14,18 @@ const commonOutputFields = `"error"?: string, // Error messages about unexpected
 const vlLocateParam =
  'locate: {bbox: [number, number, number, number], prompt: string }';

-const systemTemplateOfVLPlanning = (
-  vlMode: ReturnType<typeof vlLocateMode>,
-) => `
+const systemTemplateOfVLPlanning = ({
+  pageType,
+  vlMode,
+}: {
+  pageType: PageType;
+  vlMode: ReturnType<typeof vlLocateMode>;
+}) => `
 Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. 

 Restriction:
 - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll.
+- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === 'android' ? ', AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton.' : '.'}
 - Don't repeat actions in the previous logs.
 - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.

@ -31,6 +35,13 @@ Supporting actions:
 - Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. 
 - KeyboardPress: { type: "KeyboardPress", param: { value: string } }
 - Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
+${
+  pageType === 'android'
+    ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
+- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
+- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }`
+    : ''
+}

 Field description:
 * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
@ -67,7 +78,7 @@ this and output the JSON:
 `;

 const llmLocateParam = `locate: {{"id": string, "prompt": string}} | null`;
-const systemTemplateOfLLM = `
+const systemTemplateOfLLM = ({ pageType }: { pageType: PageType }) => `
 ## Role

 You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
@ -81,7 +92,7 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Workflow

 1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === 'android' ? '/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton' : ''}). The "About the action" section below will give you more details.
 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
 5. Consider whether the user's instruction will be accomplished after all the actions
@ -127,11 +138,22 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
    }}
    * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field. 
    * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
+  * {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
 - type: 'ExpectedFalsyCondition'
  * {{ param: {{ reason: string }} }}
  * use this action when the conditional statement talked about in the instruction is falsy.
 - type: 'Sleep'
  * {{ param: {{ timeMs: number }} }}
+${
+  pageType === 'android'
+    ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
+  * {{ param: {{}} }}
+- type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
+  * {{ param: {{}} }}
+- type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
+  * {{ param: {{}} }}`
+    : ''
+}
 `;

 const outputTemplate = `
@ -208,15 +230,19 @@ Reason:
 * Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
 `;

-export async function systemPromptToTaskPlanning(
-  vlMode: ReturnType<typeof vlLocateMode>,
-) {
+export async function systemPromptToTaskPlanning({
+  pageType,
+  vlMode,
+}: {
+  pageType: PageType;
+  vlMode: ReturnType<typeof vlLocateMode>;
+}) {
  if (vlMode) {
-    return systemTemplateOfVLPlanning(vlMode);
+    return systemTemplateOfVLPlanning({ pageType, vlMode });
  }

  const promptTemplate = new PromptTemplate({
-    template: `${systemTemplateOfLLM}\n\n${outputTemplate}`,
+    template: `${systemTemplateOfLLM({ pageType })}\n\n${outputTemplate}`,
    inputVariables: ['pageDescription'],
  });

@ -249,7 +275,7 @@ export const planSchema: ResponseFormatJSONSchema = {
              type: {
                type: 'string',
                description:
-                  'Type of action, one of "Tap", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep"',
+                  'Type of action, one of "Tap", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton"',
              },
              param: {
                anyOf: [
@ -282,6 +308,12 @@ export const planSchema: ResponseFormatJSONSchema = {
                    required: ['reason'],
                    additionalProperties: false,
                  },
+                  {
+                    type: 'object',
+                    properties: { button: { type: 'string' } },
+                    required: ['button'],
+                    additionalProperties: false,
+                  },
                ],
                description:
                  'Parameter of the action, can be null ONLY when the type field is Tap or Hover',
@ -365,8 +397,10 @@ Here is the user's instruction:
 `;
 };

-export const automationUserPrompt = () => {
-  if (vlLocateMode()) {
+export const automationUserPrompt = (
+  vlMode: ReturnType<typeof vlLocateMode>,
+) => {
+  if (vlMode) {
    return new PromptTemplate({
      template: '{taskBackgroundContext}',
      inputVariables: ['taskBackgroundContext'],
--- a/packages/core/src/ai-model/ui-tars-planning.ts
+++ b/packages/core/src/ai-model/ui-tars-planning.ts
@ -13,7 +13,10 @@ type ActionType =
  | 'hotkey'
  | 'finished'
  | 'scroll'
-  | 'wait';
+  | 'wait'
+  | 'androidBackButton'
+  | 'androidHomeButton'
+  | 'androidRecentAppsButton';

 const bboxSize = 10;
 const pointToBbox = (
@ -145,6 +148,25 @@ export async function vlmPlanning(options: {
        locate: null,
        thought: action.thought || '',
      });
+    } else if (action.action_type === 'androidBackButton') {
+      transformActions.push({
+        type: 'AndroidBackButton',
+        param: {},
+        locate: null,
+        thought: action.thought || '',
+      });
+    } else if (action.action_type === 'androidHomeButton') {
+      transformActions.push({
+        type: 'AndroidHomeButton',
+        param: {},
+        locate: null,
+        thought: action.thought || '',
+      });
+    } else if (action.action_type === 'androidRecentAppsButton') {
+      // Carry the same `locate` and `thought` fields as the other Android
+      // button branches so the model's reasoning is not dropped for this action.
+      transformActions.push({
+        type: 'AndroidRecentAppsButton',
+        param: {},
+        locate: null,
+        thought: action.thought || '',
+      });
    }
  });

--- a/packages/core/src/types.ts
+++ b/packages/core/src/types.ts
@ -271,7 +271,11 @@ export interface PlanningAction<ParamType = any> {
    | 'Assert'
    | 'AssertWithoutThrow'
    | 'Sleep'
-    | 'Finished';
+    | 'Finished'
+    | 'AndroidBackButton'
+    | 'AndroidHomeButton'
+    | 'AndroidRecentAppsButton';
+
  param: ParamType;
  locate?: PlanningLocateParam | null;
 }
@ -518,3 +522,10 @@ export interface GroupedActionDump {
  groupDescription?: string;
  executions: ExecutionDump[];
 }
+
+export type PageType = // value carried in a page object's `pageType` field
+  | 'puppeteer'
+  | 'playwright'
+  | 'static'
+  | 'chrome-extension-proxy'
+  | 'android'; // planning prompts list the Android system-button actions only for this value
--- a/packages/core/tests/unit-test/env.test.ts
+++ b/packages/core/tests/unit-test/env.test.ts
@ -100,9 +100,9 @@ describe('env', () => {
  });

  describe('overrideAIConfig', () => {
-    it('should extend global config by default', () => {
+    it('should extend global config when extendMode is true', () => {
      overrideAIConfig({ [MIDSCENE_MODEL_NAME]: 'model-1' });
-      overrideAIConfig({ [MIDSCENE_USE_QWEN_VL]: 'true' });
+      overrideAIConfig({ [MIDSCENE_USE_QWEN_VL]: 'true' }, true);

      expect(getAIConfig(MIDSCENE_MODEL_NAME)).toBe('model-1');
      expect(getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL)).toBe(true);
--- a/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
+++ b/packages/core/tests/unit-test/prompt/snapshots/prompt.test.ts.snap
@ -319,6 +319,18 @@ exports[`system prompts > planning - 4o - response format 1`] = `
                    ],
                    "type": "object",
                  },
+                  {
+                    "additionalProperties": false,
+                    "properties": {
+                      "button": {
+                        "type": "string",
+                      },
+                    },
+                    "required": [
+                      "button",
+                    ],
+                    "type": "object",
+                  },
                ],
                "description": "Parameter of the action, can be null ONLY when the type field is Tap or Hover",
              },
@ -327,7 +339,7 @@ exports[`system prompts > planning - 4o - response format 1`] = `
                "type": "string",
              },
              "type": {
-                "description": "Type of action, one of "Tap", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep"",
+                "description": "Type of action, one of "Tap", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton"",
                "type": "string",
              },
            },
@ -388,7 +400,7 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Workflow

 1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ). The "About the action" section below will give you more details.
 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
 5. Consider whether the user's instruction will be accomplished after all the actions
@ -434,6 +446,7 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
    }
    * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field. 
    * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
+  * { param: { button: 'Back' | 'Home' | 'RecentApp' } }
 - type: 'ExpectedFalsyCondition'
  * { param: { reason: string } }
  * use this action when the conditional statement talked about in the instruction is falsy.
@ -442,6 +455,7 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:



+
 ## Output JSON Format:

 The JSON format is as follows:
@ -517,6 +531,62 @@ Reason:
 "
 `;

+exports[`system prompts > planning - android 1`] = `
+"
+Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. 
+
+Restriction:
+- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
+- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll, AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton.
+- Don't repeat actions in the previous logs.
+- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing 2d bounding box as [xmin, ymin, xmax, ymax].
+
+Supporting actions:
+- Tap: { type: "Tap", locate: {bbox: [number, number, number, number], prompt: string } }
+- Hover: { type: "Hover", locate: {bbox: [number, number, number, number], prompt: string } }
+- Input: { type: "Input", locate: {bbox: [number, number, number, number], prompt: string }, param: { value: string } } // \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. 
+- KeyboardPress: { type: "KeyboardPress", param: { value: string } }
+- Scroll: { type: "Scroll", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
+- AndroidBackButton: { type: "AndroidBackButton", param: {} }
+- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
+- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
+
+Field description:
+* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
+
+Return in JSON format:
+{
+  "what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. 
+  "log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.
+  "error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
+  "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.
+  "action": 
+    {
+      // one of the supporting actions
+    } | null,
+  ,
+  "sleep"?: number, // The sleep time after the action, in milliseconds.
+}
+
+For example, when the instruction is "click 'Confirm' button, and click 'Yes' in popup" and the log is "I will use action Tap to click 'Confirm' button", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.
+
+this and output the JSON:
+
+{
+  "what_the_user_wants_to_do_next_by_instruction": "We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup",
+  "log": "I will use action Tap to click 'Yes' in popup",
+  "more_actions_needed_by_instruction": false,
+  "action": {
+    "type": "Tap",
+    "locate": {
+      "bbox": [100, 100, 200, 200],
+      "prompt": "The 'Yes' button in popup"
+    }
+  }
+}
+"
+`;
+
 exports[`system prompts > planning - background context 1`] = `
 "
 Here is the user's instruction:
@ -554,6 +624,7 @@ Supporting actions:
 - KeyboardPress: { type: "KeyboardPress", param: { value: string } }
 - Scroll: { type: "Scroll", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.

+
 Field description:
 * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.

@ -607,6 +678,7 @@ Supporting actions:
 - KeyboardPress: { type: "KeyboardPress", param: { value: string } }
 - Scroll: { type: "Scroll", locate: {bbox: [number, number, number, number], prompt: string } | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.

+
 Field description:
 * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.

@ -653,15 +725,7 @@ THIS IS PAGE DESCRIPTION
 THIS IS BACKGROUND CONTEXT"
 `;

-exports[`system prompts > planning - user prompt - qwen 1`] = `
-"
-pageDescription:
-=====================================
-THIS IS PAGE DESCRIPTION
-=====================================
-
-THIS IS BACKGROUND CONTEXT"
-`;
+exports[`system prompts > planning - user prompt - qwen 1`] = `"THIS IS BACKGROUND CONTEXT"`;

 exports[`system prompts > section locator - gemini 1`] = `
 "
--- a/packages/core/tests/unit-test/prompt/prompt.test.ts
+++ b/packages/core/tests/unit-test/prompt/prompt.test.ts
@ -11,7 +11,10 @@ import { describe, expect, it } from 'vitest';

 describe('system prompts', () => {
  it('planning - 4o', async () => {
-    const prompt = await systemPromptToTaskPlanning(false);
+    const prompt = await systemPromptToTaskPlanning({
+      pageType: 'puppeteer',
+      vlMode: false,
+    });
    expect(prompt).toMatchSnapshot();
  });

@ -21,12 +24,26 @@ describe('system prompts', () => {
  });

  it('planning - qwen', async () => {
-    const prompt = await systemPromptToTaskPlanning('qwen-vl');
+    const prompt = await systemPromptToTaskPlanning({
+      pageType: 'puppeteer',
+      vlMode: 'qwen-vl',
+    });
    expect(prompt).toMatchSnapshot();
  });

  it('planning - gemini', async () => {
-    const prompt = await systemPromptToTaskPlanning('gemini');
+    const prompt = await systemPromptToTaskPlanning({
+      pageType: 'puppeteer',
+      vlMode: 'gemini',
+    });
+    expect(prompt).toMatchSnapshot();
+  });
+
+  it('planning - android', async () => {
+    const prompt = await systemPromptToTaskPlanning({
+      pageType: 'android',
+      vlMode: 'qwen-vl',
+    });
    expect(prompt).toMatchSnapshot();
  });

@ -40,7 +57,7 @@ describe('system prompts', () => {
  });

  it('planning - user prompt - 4o', async () => {
-    const prompt = automationUserPrompt();
+    const prompt = automationUserPrompt(false);
    const result = await prompt.format({
      pageDescription: 'THIS IS PAGE DESCRIPTION',
      taskBackgroundContext: 'THIS IS BACKGROUND CONTEXT',
@ -50,8 +67,7 @@ describe('system prompts', () => {
  });

  it('planning - user prompt - qwen', async () => {
-    process.env.MIDSCENE_USE_QWEN_VL = 'true';
-    const prompt = automationUserPrompt();
+    const prompt = automationUserPrompt('qwen-vl');
    const result = await prompt.format({
      pageDescription: 'THIS IS PAGE DESCRIPTION',
      taskBackgroundContext: 'THIS IS BACKGROUND CONTEXT',
--- a/packages/visualizer/src/component/playground/PromptInput.tsx
+++ b/packages/visualizer/src/component/playground/PromptInput.tsx
@ -90,21 +90,29 @@ export const PromptInput: React.FC<PromptInputProps> = ({
        handleRunWithHistory();
        e.preventDefault();
        e.stopPropagation();
+      } else if (e.key === 'Enter') {
+        setTimeout(() => {
+          if (textAreaRef.current) {
+            const textarea = textAreaRef.current.resizableTextArea.textArea;
+            const selectionStart = textarea.selectionStart;
+            const value = textarea.value;
+
+            // check whether the cursor is on the last line (i.e. after the final newline)
+            const lastNewlineIndex = value.lastIndexOf('\n');
+            const isAtLastLine =
+              lastNewlineIndex === -1 || selectionStart > lastNewlineIndex;
+
+            // only scroll to bottom when the cursor is on the last line
+            if (isAtLastLine) {
+              textarea.scrollTop = textarea.scrollHeight;
+            }
+          }
+        }, 0);
      }
    },
    [handleRunWithHistory],
  );

-  // handle input change, auto scroll to bottom
-  const handleChange = useCallback(() => {
-    setTimeout(() => {
-      if (textAreaRef.current) {
-        const textarea = textAreaRef.current.resizableTextArea.textArea;
-        textarea.scrollTop = textarea.scrollHeight;
-      }
-    }, 0);
-  }, []);
-
  // Handle settings hover state
  const handleMouseEnter = useCallback(() => {
    setHoveringSettings(true);
@ -206,7 +214,6 @@ export const PromptInput: React.FC<PromptInputProps> = ({
            placeholder={placeholder}
            autoFocus
            onKeyDown={handleKeyDown}
-            onChange={handleChange}
            ref={textAreaRef}
          />
        </Form.Item>
--- a/packages/visualizer/src/component/playground/index.less
+++ b/packages/visualizer/src/component/playground/index.less
@ -192,7 +192,6 @@ body {
      overflow-y: auto;
      white-space: pre-wrap;
      line-height: 21px;
-      max-height: 150px;

      scrollbar-width: thin;

--- a/packages/web-integration/src/common/page.d.ts
+++ b/packages/web-integration/src/common/page.d.ts
@ -1,5 +1,5 @@
 import type { ElementTreeNode } from '@midscene/core';
-import type { Point, Size } from '@midscene/core';
+import type { PageType, Point, Size } from '@midscene/core';
 import type { ElementInfo } from '@midscene/shared/extractor';
 import type { KeyInput } from 'puppeteer';
 import type ChromeExtensionProxyPage from '../chrome-extension/page';
@ -9,7 +9,7 @@ import type { PlaywrightWebPage } from '../playwright';
 import type { PuppeteerWebPage } from '../puppeteer';

 export interface AndroidDevicePage extends AbstractPage {
-  pageType: string;
+  pageType: PageType;
  connect(): Promise<any>;
  launch(uri: string): Promise<any>;

@ -26,6 +26,9 @@ export interface AndroidDevicePage extends AbstractPage {
  scrollDown(distance?: number, startingPoint?: Point): Promise<void>;
  scrollLeft(distance?: number, startingPoint?: Point): Promise<void>;
  scrollRight(distance?: number): Promise<void>;
+  back(): Promise<void>;
+  home(): Promise<void>;
+  recentApps(): Promise<void>;
 }

 export type WebPage =
--- a/packages/web-integration/src/common/tasks.ts
+++ b/packages/web-integration/src/common/tasks.ts
@ -1,4 +1,4 @@
-import type { WebPage } from '@/common/page';
+import type { AndroidDevicePage, WebPage } from '@/common/page';
 import type { PuppeteerWebPage } from '@/puppeteer';
 import {
  type AIUsageInfo,
@ -15,6 +15,7 @@ import {
  type InsightAssertionResponse,
  type InsightDump,
  type InsightExtractParam,
+  type PageType,
  type PlanningAIResponse,
  type PlanningAction,
  type PlanningActionParamAssert,
@ -46,6 +47,10 @@ interface ExecutionResult<OutputType = any> {

 const replanningCountLimit = 10;

+// Type guard: narrows a WebPage to AndroidDevicePage when its pageType is 'android'.
+const isAndroidPage = (page: WebPage): page is AndroidDevicePage =>
+  page.pageType === 'android';
+
 export class PageTaskExecutor {
  page: WebPage;

@ -484,6 +489,56 @@ export class PageTaskExecutor {
          executor: async (param) => {},
        };
        tasks.push(taskActionFinished);
+      } else if (plan.type === 'AndroidHomeButton') {
+        const taskActionAndroidHomeButton: ExecutionTaskActionApply<null> = {
+          type: 'Action',
+          subType: 'AndroidHomeButton',
+          param: null,
+          thought: plan.thought,
+          locate: plan.locate,
+          executor: async (param) => {
+            // The home button only exists on Android devices, so reject any other page type first
+            assert(
+              isAndroidPage(this.page),
+              'Cannot use home button on non-Android devices',
+            );
+            await this.page.home();
+          },
+        };
+        tasks.push(taskActionAndroidHomeButton);
+      } else if (plan.type === 'AndroidBackButton') {
+        const taskActionAndroidBackButton: ExecutionTaskActionApply<null> = {
+          type: 'Action',
+          subType: 'AndroidBackButton',
+          param: null,
+          thought: plan.thought,
+          locate: plan.locate,
+          executor: async (param) => {
+            assert(
+              isAndroidPage(this.page),
+              'Cannot use back button on non-Android devices',
+            );
+            await this.page.back();
+          },
+        };
+        tasks.push(taskActionAndroidBackButton);
+      } else if (plan.type === 'AndroidRecentAppsButton') {
+        const taskActionAndroidRecentAppsButton: ExecutionTaskActionApply<null> =
+          {
+            type: 'Action',
+            subType: 'AndroidRecentAppsButton',
+            param: null,
+            thought: plan.thought,
+            locate: plan.locate,
+            executor: async (param) => {
+              assert(
+                isAndroidPage(this.page),
+                'Cannot use recent apps button on non-Android devices',
+              );
+              await this.page.recentApps();
+            },
+          };
+        tasks.push(taskActionAndroidRecentAppsButton);
      } else {
        throw new Error(`Unknown or unsupported task type: ${plan.type}`);
      }
@ -556,6 +611,7 @@ export class PageTaskExecutor {
            context: pageContext,
            log: param.log,
            actionContext,
+            pageType: this.page.pageType as PageType,
          });
        }