feat: support the if-statement in planning prompt (#184)

This commit is contained in:
yuyutaotao 2024-12-19 10:44:08 +08:00 committed by GitHub
parent 6e54c153de
commit 523adab12f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 481 additions and 248 deletions

View File

@ -1,6 +1,6 @@
# Customize Model Provider
# Customize Model and Provider
Midscene uses the OpenAI SDK as the default AI service. You can customize the configuration using environment variables.
Midscene uses the OpenAI SDK to call AI services. You can customize the configuration using environment variables.
These are the main configs, among which `OPENAI_API_KEY` is required.
@ -29,3 +29,10 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke
# if you want to use proxy. Midscene uses `socks-proxy-agent` under the hood.
export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080"
```
Note:
- Always choose a model that supports vision input. Currently, the known supported models are:
  - OpenAI: `gpt-4o`
  - Aliyun: `qwen-vl-max-latest`
- Please follow the terms of use of each model.

View File

@ -1,6 +1,6 @@
# 自定义模型服务
# 自定义模型服务
Midscene 默认集成了 OpenAI SDK 调用 AI 服务,你可以通过环境变量来自定义配置。
Midscene 默认集成了 OpenAI SDK 调用 AI 服务,你可以通过环境变量来自定义配置。
主要配置项如下,其中 `OPENAI_API_KEY` 是必选项:
@ -29,3 +29,10 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke
# 可选, 如果你想使用代理。Midscene 使用 `socks-proxy-agent` 作为底层库。
export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080"
```
说明:
- 务必选择一个支持视觉输入的模型。目前我们已知支持的模型有:
- OpenAI: `gpt-4o`
- 阿里云: `qwen-vl-max-latest`
- 请遵守各项模型的使用条款

View File

@ -81,7 +81,7 @@ export default defineConfig({
link: '/cache',
},
{
text: 'Customize Model Provider',
text: 'Customize Model and Provider',
link: '/model-provider',
},
],
@ -138,7 +138,7 @@ export default defineConfig({
link: '/zh/cache',
},
{
text: '自定义模型服务',
text: '自定义模型服务',
link: '/zh/model-provider',
},
],

View File

@ -17,7 +17,6 @@ export async function plan(
context: UIContext;
callAI?: typeof callAiFn<PlanningAIResponse>;
},
useModel?: 'coze' | 'openAI',
): Promise<PlanningAIResponse> {
const { callAI, context } = opts || {};
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
@ -56,7 +55,7 @@ ${opts.whatHaveDone}
pageDescription:\n
${pageDescription}
\n
Here is what you need to do now:
Here is the instruction:
=====================================
${userPrompt}
=====================================
@ -72,7 +71,6 @@ ${taskBackgroundContext}
const { content, usage } = await call({
msgs,
AIActionType: AIActionType.PLAN,
useModel,
});
const planFromAI = content;

View File

@ -5,15 +5,6 @@ import type {
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
} from 'openai/resources';
import {
COZE_AI_ACTION_BOT_ID,
COZE_AI_ASSERT_BOT_ID,
COZE_EXTRACT_INFO_BOT_ID,
COZE_INSPECT_ELEMENT_BOT_ID,
callCozeAi,
preferCozeModel,
transformOpenAiArgsToCoze,
} from './coze';
import { callToGetJSONObject, preferOpenAIModel } from './openai';
export type AIArgs = [
@ -31,10 +22,9 @@ export enum AIActionType {
export async function callAiFn<T>(options: {
msgs: AIArgs;
AIActionType: AIActionType;
useModel?: 'openAI' | 'coze';
}): Promise<{ content: T; usage?: AIUsageInfo }> {
const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
if (preferOpenAIModel(useModel)) {
const { msgs, AIActionType: AIActionTypeValue } = options;
if (preferOpenAIModel('openAI')) {
const { content, usage } = await callToGetJSONObject<T>(
msgs,
AIActionTypeValue,
@ -42,29 +32,6 @@ export async function callAiFn<T>(options: {
return { content, usage };
}
// if (preferCozeModel(useModel)) {
// let botId = '';
// switch (AIActionTypeValue) {
// case AIActionType.ASSERT:
// botId = COZE_AI_ASSERT_BOT_ID;
// break;
// case AIActionType.EXTRACT_DATA:
// botId = COZE_EXTRACT_INFO_BOT_ID;
// break;
// case AIActionType.INSPECT_ELEMENT:
// botId = COZE_INSPECT_ELEMENT_BOT_ID;
// break;
// default:
// botId = COZE_AI_ACTION_BOT_ID;
// }
// const cozeMsg = transformOpenAiArgsToCoze(msgs[1]);
// const parseResult = await callCozeAi<T>({
// ...cozeMsg,
// botId,
// });
// return parseResult;
// }
throw Error(
'Cannot find OpenAI config. You should set it before using. https://midscenejs.com/model-provider.html',
);

View File

@ -67,8 +67,7 @@ export async function AiInspectElement<
useModel?: 'coze' | 'openAI';
quickAnswer?: AISingleElementResponse;
}) {
const { context, multi, targetElementDescription, callAI, useModel } =
options;
const { context, multi, targetElementDescription, callAI } = options;
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
const { description, elementById, elementByPosition } =
await describeUserPage(context);
@ -152,7 +151,6 @@ ${JSON.stringify({
const res = await callAI({
msgs,
AIActionType: AIActionType.INSPECT_ELEMENT,
useModel,
});
return {
parseResult: transformElementPositionToId(res.content, context.content),
@ -165,7 +163,6 @@ ${JSON.stringify({
const inspectElement = await callAiFn<AIElementResponse>({
msgs,
AIActionType: AIActionType.INSPECT_ELEMENT,
useModel,
});
return {
@ -231,7 +228,6 @@ DATA_DEMAND ends.
const result = await callAiFn<AISectionParseResponse<T>>({
msgs,
useModel,
AIActionType: AIActionType.EXTRACT_DATA,
});
return {
@ -285,7 +281,6 @@ export async function AiAssert<
const { content: assertResult, usage } = await callAiFn<AIAssertionResponse>({
msgs,
AIActionType: AIActionType.ASSERT,
useModel,
});
return {
content: assertResult,

View File

@ -118,13 +118,12 @@ export async function callToGetJSONObject<T>(
// gpt-4o-2024-05-13 only supports json_object response format
let responseFormat:
| OpenAI.ChatCompletionCreateParams['response_format']
| OpenAI.ResponseFormatJSONObject = {
type: AIResponseFormat.JSON,
};
| OpenAI.ResponseFormatJSONObject
| undefined;
const model = getModelName();
if (model === 'gpt-4o-2024-08-06') {
if (model.includes('gpt-4o')) {
switch (AIActionTypeValue) {
case AIActionType.ASSERT:
responseFormat = assertSchema;
@ -140,10 +139,10 @@ export async function callToGetJSONObject<T>(
responseFormat = planSchema;
break;
}
}
if (model.startsWith('gemini')) {
responseFormat = { type: AIResponseFormat.TEXT };
if (model === 'gpt-4o-2024-05-13') {
responseFormat = { type: AIResponseFormat.JSON };
}
}
const safeJsonParse = (input: string) => {
@ -162,7 +161,7 @@ export async function callToGetJSONObject<T>(
try {
return { content: JSON.parse(jsonContent), usage: response.usage };
} catch {
throw Error(`parse json error: ${response.content}`);
throw Error(`failed to parse json response: ${response.content}`);
}
}

View File

@ -34,14 +34,14 @@ You are a versatile professional in software UI automation. Your outstanding con
## Objective
- Decompose the task user asked into a series of actions
- Decompose the instruction user asked into a series of actions
- Locate the target element if possible
- If the task cannot be accomplished, give a further plan.
- If the instruction cannot be accomplished, give a further plan.
## Workflow
1. Receive the user's element description, screenshot, and instruction.
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / Error / Sleep). The "About the action" section below will give you more details.
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
5. Consider whether the user's instruction will be accomplished after all the actions
@ -52,7 +52,8 @@ You are a versatile professional in software UI automation. Your outstanding con
- All the actions you composed MUST be based on the page context information you get.
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
- If you cannot plan any actions at all, consider the page content is irrelevant to the task. Put the error message in the \`error\` field.
- Respond only with valid JSON. Do not write an introduction or summary.
- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
## About the \`actions\` field
@ -77,10 +78,20 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
* \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
- type: 'KeyboardPress', press a key
* { param: { value: string } }
- type: 'Scroll'
* { param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' } }
- type: 'Error'
* { param: { message: string } }
- type: 'Scroll', scroll up or down.
* {
locate: LocateParam | null,
param: {
direction: 'down'(default) | 'up' | 'right' | 'left',
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
distance: null | number
}
}
* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
- type: 'FalsyConditionStatement'
* { param: null }
* use this action when the instruction is an "if" statement and the condition is falsy.
- type: 'Sleep'
* { param: { timeMs: number } }
@ -94,7 +105,8 @@ Each action has a \`type\` and corresponding \`param\`. To be detailed:
## Output JSON Format:
Please return the result in JSON format as follows:
The JSON format is as follows:
{
"actions": [
{
@ -152,6 +164,7 @@ By viewing the page screenshot and description, you should consider this and out
"locate": null
},
],
"error": null,
"taskWillBeAccomplished": false,
"furtherPlan": {
"whatToDoNext": "find the 'English' option and click on it",
@ -160,7 +173,39 @@ By viewing the page screenshot and description, you should consider this and out
}
\`\`\`
## Example #2 : When task is accomplished, don't plan more actions
## Example #2 : Tolerate the error situation only when the instruction is an "if" statement
If the user says "If there is a popup, close it", you should consider this and output the JSON:
* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
\`\`\`json
{
"actions": [{
"thought": "There is no popup on the page",
"type": "FalsyConditionStatement",
"param": null
}
],
"taskWillBeAccomplished": true,
"furtherPlan": null
}
\`\`\`
For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
\`\`\`json
{
"actions": [],
"error": "The instruction and page context are irrelevant, there is no popup on the page",
"taskWillBeAccomplished": true,
"furtherPlan": null
}
\`\`\`
## Example #3 : When task is accomplished, don't plan more actions
When the user ask to "Wait 4s", you should consider this:
@ -234,7 +279,7 @@ export const planSchema: ResponseFormatJSONSchema = {
param: {
type: ['object', 'null'],
description:
'Parameter towards the task type, can be null only when the type field is Tap or Hover',
'Parameter of the action, can be null ONLY when the type field is Tap or Hover',
},
locate: {
type: ['object', 'null'],

View File

@ -243,6 +243,7 @@ export interface PlanningAction<ParamType = any> {
| 'KeyboardPress'
| 'Scroll'
| 'Error'
| 'FalsyConditionStatement'
| 'Assert'
| 'AssertWithoutThrow'
| 'Sleep';
@ -269,11 +270,9 @@ export interface PlanningActionParamInputOrKeyPress {
value: string;
}
export interface PlanningActionParamScroll {
scrollType:
| 'scrollUntilTop'
| 'scrollUntilBottom'
| 'scrollUpOneScreen'
| 'scrollDownOneScreen';
direction: 'down' | 'up' | 'right' | 'left';
scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft';
distance: null | number;
}
export interface PlanningActionParamAssert {

View File

@ -1,12 +1,12 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`automation - planning openAI > basic run 1`] = `
exports[`automation - planning > basic run 1`] = `
{
"timeMs": 3500,
}
`;
exports[`automation - planning openAI > basic run 2`] = `
exports[`automation - planning > basic run 2`] = `
{
"value": "Enter",
}

View File

@ -1,7 +1,6 @@
import { plan } from '@/ai-model';
/* eslint-disable max-lines-per-function */
import { describe, expect, it, vi } from 'vitest';
import { modelList } from '../util';
import { getPageDataOfTestName } from './test-suite/util';
vi.setConfig({
@ -9,42 +8,113 @@ vi.setConfig({
hookTimeout: 30 * 1000,
});
modelList.forEach((model) => {
describe(`automation - planning ${model}`, () => {
it('basic run', async () => {
const { context } = await getPageDataOfTestName('todo');
describe('automation - planning', () => {
it('basic run', async () => {
const { context } = await getPageDataOfTestName('todo');
const { actions } = await plan(
'type "Why is the earth a sphere?", wait 3.5s, hit Enter',
{
context,
},
model,
);
expect(actions.length).toBe(3);
expect(actions[0].type).toBe('Input');
expect(actions[1].type).toBe('Sleep');
expect(actions[1].param).toMatchSnapshot();
expect(actions[2].type).toBe('KeyboardPress');
expect(actions[2].param).toMatchSnapshot();
});
const { actions } = await plan(
'type "Why is the earth a sphere?", wait 3.5s, hit Enter',
{
context,
},
);
it('instructions of to-do mvc', async () => {
const { context } = await getPageDataOfTestName('todo');
const instructions = [
'在任务框 input 输入 今天学习 JS按回车键',
'在任务框 input 输入 明天学习 Rust按回车键',
'在任务框 input 输入后天学习 AI按回车键',
'将鼠标移动到任务列表中的第二项,点击第二项任务右边的删除按钮',
'点击第二条任务左边的勾选按钮',
'点击任务列表下面的 completed 状态按钮',
];
expect(actions.length).toBe(3);
expect(actions[0].type).toBe('Input');
expect(actions[1].type).toBe('Sleep');
expect(actions[1].param).toMatchSnapshot();
expect(actions[2].type).toBe('KeyboardPress');
expect(actions[2].param).toMatchSnapshot();
});
for (const instruction of instructions) {
const { actions } = await plan(instruction, { context }, model);
expect(actions).toBeTruthy();
expect(actions[0].locate?.id).toBeTruthy();
}
});
it('instructions of to-do mvc', async () => {
const { context } = await getPageDataOfTestName('todo');
const instructions = [
'在任务框 input 输入 今天学习 JS按回车键',
'在任务框 input 输入 明天学习 Rust按回车键',
'在任务框 input 输入后天学习 AI按回车键',
'将鼠标移动到任务列表中的第二项,点击第二项任务右边的删除按钮',
'点击第二条任务左边的勾选按钮',
'点击任务列表下面的 completed 状态按钮',
];
for (const instruction of instructions) {
const { actions } = await plan(instruction, { context });
expect(actions).toBeTruthy();
expect(actions[0].locate?.id).toBeTruthy();
}
});
it('scroll some element', async () => {
  // Plan against the recorded "todo" page fixture.
  const pageData = await getPageDataOfTestName('todo');
  const planned = await plan(
    'Scroll left the status filters (with a button named "complete")',
    { context: pageData.context },
  );
  const plannedActions = planned.actions;

  // Scrolling a specific element must yield a Scroll action with a located target.
  expect(plannedActions).toBeTruthy();
  const firstAction = plannedActions[0];
  expect(firstAction.type).toBe('Scroll');
  expect(firstAction.locate).toBeTruthy();
});
it('scroll page', async () => {
  const { context } = await getPageDataOfTestName('todo');
  const { actions } = await plan(
    'Scroll down the page by 200px, scroll up the page by 100px, scroll right the second item of the task list by 300px',
    { context },
  );

  // Check that `actions` exists before dereferencing it; the guard previously
  // ran after `actions.length` and could never fail on its own.
  expect(actions).toBeTruthy();
  expect(actions.length).toBe(3);

  // First step is a page-level scroll: Scroll action with a null `locate`.
  expect(actions[0].type).toBe('Scroll');
  expect(actions[0].locate).toBeNull();
  expect(actions[0].param).toBeDefined();

  // Third step targets a specific list item, so it must carry a located element.
  expect(actions[2].locate).toBeTruthy();
  expect(actions[2].param).toBeDefined();
});
it('throw error when instruction is not feasible', async () => {
  const { context } = await getPageDataOfTestName('todo');

  // The todo fixture has no cookie prompt, so planning this instruction must reject.
  const planInfeasibleInstruction = async () => {
    await plan('close Cookie Prompt', { context });
  };

  await expect(planInfeasibleInstruction).rejects.toThrow();
});
it('should not throw in an "if" statement', async () => {
  const { context } = await getPageDataOfTestName('todo');
  const { actions } = await plan('If there is a cookie prompt, close it', {
    context,
  });

  // Assert on the value itself so a failure reports the actual length
  // instead of just "expected false to be truthy".
  expect(actions.length).toBe(1);
  // A falsy "if" condition is tolerated and reported as a dedicated no-op action.
  expect(actions[0].type).toBe('FalsyConditionStatement');
});
it('should give a further plan when something is not found', async () => {
  const { context } = await getPageDataOfTestName('todo');
  const { furtherPlan } = await plan(
    'click the input box, wait 300ms, click the close button of the cookie prompt',
    { context },
  );

  // The last step targets something absent from the fixture, so the planner
  // is expected to hand back a follow-up plan with both fields filled in.
  expect(furtherPlan).toBeTruthy();
  expect(furtherPlan?.whatToDoNext).toBeTruthy();
  expect(furtherPlan?.whatHaveDone).toBeTruthy();
});
it('partial error', async () => {
  const { context } = await getPageDataOfTestName('todo');
  const { furtherPlan } = await plan(
    'click the input box, click the close button of the cookie prompt',
    { context },
  );

  // Only part of the instruction is feasible on this page; the remainder
  // must be reported through `furtherPlan`.
  expect(furtherPlan).toBeTruthy();
  expect(furtherPlan?.whatToDoNext).toBeTruthy();
  expect(furtherPlan?.whatHaveDone).toBeTruthy();
});
});

View File

@ -5,8 +5,8 @@ import { describe, expect, it, vi } from 'vitest';
vi.setConfig({
testTimeout: 20 * 1000,
});
describe('openai', () => {
it('basic', async () => {
describe('openai sdk connectivity', () => {
it('connectivity', async () => {
const result = await call([
{
role: 'system',
@ -37,4 +37,27 @@ describe('openai', () => {
);
expect(result.content.answer).toBe(15);
});
// Connectivity check for image input: sends one user message made of a text
// part and an image part, and expects non-trivial content in the reply.
it('image input', async () => {
const result = await call([
{
role: 'user',
content: [
{
type: 'text',
text: 'Describe this image in one sentence.',
},
{
// The image is passed by remote URL rather than as inline data.
type: 'image_url',
image_url: {
url: 'https://portal.volccdn.com/obj/volcfe/bee_prod/biz_950/tos_38e6e81e1366482ed046045e72b0684d.png',
detail: 'high',
},
},
],
},
]);
// Only the length of the returned content is checked (> 10); the wording is not asserted.
expect(result.content.length).toBeGreaterThan(10);
});
});

View File

@ -322,7 +322,10 @@ export const generateAnimationScripts = (
});
insightOnTop = true;
}
} else if (task.type === 'Action') {
} else if (
task.type === 'Action' &&
task.subType !== 'FalsyConditionStatement'
) {
const title = typeStr(task);
const subTitle = paramStr(task);
scripts.push(pointerScript(mousePointer, title, subTitle));

View File

@ -104,31 +104,63 @@ export class Page implements AbstractPage {
}
// Scroll to top element
async scrollUntilTop(): Promise<void> {
async scrollUntilTop(distance?: number): Promise<void> {
const { height } = await this.browser.getWindowSize();
const scrollDistance = distance || height * 0.7;
await this.mouseWheel(0, height, 100);
await this.mouseWheel(0, -scrollDistance, 100);
}
// Scroll to bottom element
async scrollUntilBottom(): Promise<void> {
async scrollUntilBottom(distance?: number): Promise<void> {
const { height } = await this.browser.getWindowSize();
const scrollDistance = distance || height * 0.7;
await this.mouseWheel(0, -height, 100);
await this.mouseWheel(0, scrollDistance, 100);
}
// Scrolls towards the left by issuing one mouseWheel call with a negative X delta.
// `distance` falls back to 70% of the window width when omitted (or 0).
// NOTE(review): despite the "until" name, only a single wheel call is made here and
// no edge detection is visible — confirm this is the intended behavior.
async scrollUntilLeft(distance?: number): Promise<void> {
const { width } = await this.browser.getWindowSize();
const scrollDistance = distance || width * 0.7;
// Third argument is 100 here vs. 1000 in scrollLeft — presumably a duration; TODO confirm.
await this.mouseWheel(-scrollDistance, 0, 100);
}
// Scrolls towards the right by issuing one mouseWheel call with a positive X delta.
// `distance` falls back to 70% of the window width when omitted (or 0).
// NOTE(review): despite the "until" name, only a single wheel call is made here and
// no edge detection is visible — confirm this is the intended behavior.
async scrollUntilRight(distance?: number): Promise<void> {
const { width } = await this.browser.getWindowSize();
const scrollDistance = distance || width * 0.7;
// Third argument is 100 here vs. 1000 in scrollRight — presumably a duration; TODO confirm.
await this.mouseWheel(scrollDistance, 0, 100);
}
// Scroll up one screen
async scrollUpOneScreen(): Promise<void> {
async scrollUp(distance?: number): Promise<void> {
const { height } = await this.browser.getWindowSize();
const scrollDistance = distance || height * 0.7;
await this.mouseWheel(0, height, 1000);
await this.mouseWheel(0, -scrollDistance, 1000);
}
// Scroll down one screen
async scrollDownOneScreen(): Promise<void> {
async scrollDown(distance?: number): Promise<void> {
const { height } = await this.browser.getWindowSize();
const scrollDistance = distance || height * 0.7;
await this.mouseWheel(0, -height, 1000);
await this.mouseWheel(0, scrollDistance, 1000);
}
// Scrolls left once: a single mouseWheel call with a negative X delta and no Y delta.
// `distance` falls back to 70% of the window width when omitted (or 0).
async scrollLeft(distance?: number): Promise<void> {
const { width } = await this.browser.getWindowSize();
const scrollDistance = distance || width * 0.7;
await this.mouseWheel(-scrollDistance, 0, 1000);
}
// Scrolls right once: a single mouseWheel call with a positive X delta and no Y delta.
// `distance` falls back to 70% of the window width when omitted (or 0).
async scrollRight(distance?: number): Promise<void> {
const { width } = await this.browser.getWindowSize();
const scrollDistance = distance || width * 0.7;
await this.mouseWheel(scrollDistance, 0, 1000);
}
private async keyboardType(text: string): Promise<void> {

View File

@ -192,22 +192,36 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
return this.mouse.wheel(0, 9999999);
}
async scrollUpOneScreen() {
await chrome.scripting.executeScript({
target: { tabId: this.tabId, allFrames: true },
func: () => {
window.scrollBy(0, -window.innerHeight * 0.7);
},
});
async scrollUntilLeft() {
return this.mouse.wheel(-9999999, 0);
}
async scrollDownOneScreen() {
await chrome.scripting.executeScript({
target: { tabId: this.tabId, allFrames: true },
func: () => {
window.scrollBy(0, window.innerHeight * 0.7);
},
});
async scrollUntilRight() {
return this.mouse.wheel(9999999, 0);
}
/**
 * Scrolls up with a single mouse-wheel event.
 * A missing (or zero) `distance` falls back to 70% of the page height.
 */
async scrollUp(distance?: number) {
  const pageSize = await this.size();
  const wheelDelta = distance || pageSize.height * 0.7;
  return this.mouse.wheel(0, -wheelDelta);
}
/**
 * Scrolls down with a single mouse-wheel event.
 * A missing (or zero) `distance` falls back to 70% of the page height.
 */
async scrollDown(distance?: number) {
  const pageSize = await this.size();
  const wheelDelta = distance || pageSize.height * 0.7;
  return this.mouse.wheel(0, wheelDelta);
}
/**
 * Scrolls left with a single mouse-wheel event.
 * A missing (or zero) `distance` falls back to 70% of the page width.
 */
async scrollLeft(distance?: number) {
  const pageSize = await this.size();
  const wheelDelta = distance || pageSize.width * 0.7;
  return this.mouse.wheel(-wheelDelta, 0);
}
/**
 * Scrolls right with a single mouse-wheel event.
 * A missing (or zero) `distance` falls back to 70% of the page width.
 */
async scrollRight(distance?: number) {
  const pageSize = await this.size();
  const wheelDelta = distance || pageSize.width * 0.7;
  return this.mouse.wheel(wheelDelta, 0);
}
async clearInput(element: ElementInfo) {
@ -309,60 +323,3 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
await this.detachDebugger();
}
}
// backup: some implementation by chrome extension API instead of CDP
// async function getPageContentOfTab(tabId: number): Promise<{
// context: ElementInfo[];
// size: { width: number; height: number; dpr: number };
// }> {
// await chrome.scripting.executeScript({
// target: {
// tabId,
// allFrames: true,
// },
// files: [scriptFileToRetrieve],
// });
// // call and retrieve the result
// const returnValue = await chrome.scripting.executeScript({
// target: { tabId, allFrames: true },
// func: () => {
// return {
// context: (
// window as any
// ).midscene_element_inspector.webExtractTextWithPosition(),
// size: {
// width: document.documentElement.clientWidth,
// height: document.documentElement.clientHeight,
// dpr: window.devicePixelRatio,
// },
// };
// },
// });
// console.log('returnValue', returnValue);
// if (!returnValue[0].result) {
// throw new Error(`Failed to get active page content of tabId: ${tabId}`);
// }
// return returnValue[0].result;
// }
// async function getSizeInfoOfTab(tabId: number): Promise<{
// dpr: number;
// width: number;
// height: number;
// }> {
// const returnValue = await chrome.scripting.executeScript({
// target: { tabId, allFrames: false },
// func: () => {
// return {
// dpr: window.devicePixelRatio,
// width: document.documentElement.clientWidth,
// height: document.documentElement.clientHeight,
// };
// },
// });
// // console.log('returnValue of getScreenInfoOfTab', returnValue);
// return returnValue[0].result!;
// }

View File

@ -313,27 +313,42 @@ export class PageTaskExecutor {
param: plan.param,
thought: plan.thought,
locate: plan.locate,
executor: async (taskParam) => {
const scrollToEventName = taskParam.scrollType;
switch (scrollToEventName) {
case 'scrollUntilTop':
await this.page.scrollUntilTop();
break;
case 'scrollUntilBottom':
await this.page.scrollUntilBottom();
break;
case 'scrollUpOneScreen':
await this.page.scrollUpOneScreen();
break;
case 'scrollDownOneScreen':
await this.page.scrollDownOneScreen();
break;
default:
console.error(
'Unknown scroll event type:',
scrollToEventName,
executor: async (taskParam, { element }) => {
if (element) {
await this.page.mouse.move(
element.center[0],
element.center[1],
);
}
const scrollToEventName = taskParam?.scrollType;
if (scrollToEventName === 'untilTop') {
await this.page.scrollUntilTop();
} else if (scrollToEventName === 'untilBottom') {
await this.page.scrollUntilBottom();
} else if (scrollToEventName === 'untilRight') {
await this.page.scrollUntilRight();
} else if (scrollToEventName === 'untilLeft') {
await this.page.scrollUntilLeft();
} else if (scrollToEventName === 'once') {
if (taskParam.direction === 'down') {
await this.page.scrollDown(taskParam.distance || undefined);
} else if (taskParam.direction === 'up') {
await this.page.scrollUp(taskParam.distance || undefined);
} else if (taskParam.direction === 'left') {
await this.page.scrollLeft(taskParam.distance || undefined);
} else if (taskParam.direction === 'right') {
await this.page.scrollRight(taskParam.distance || undefined);
} else {
throw new Error(
`Unknown scroll direction: ${taskParam.direction}`,
);
}
} else {
throw new Error(
`Unknown scroll event type: ${scrollToEventName}, taskParam: ${JSON.stringify(
taskParam,
)}`,
);
}
},
};
@ -364,6 +379,19 @@ export class PageTaskExecutor {
},
};
tasks.push(taskActionError);
} else if (plan.type === 'FalsyConditionStatement') {
const taskActionFalsyConditionStatement: ExecutionTaskActionApply<null> =
{
type: 'Action',
subType: 'FalsyConditionStatement',
param: null,
thought: plan.thought,
locate: plan.locate,
executor: async () => {
// console.warn(`[warn]falsy condition: ${plan.thought}`);
},
};
tasks.push(taskActionFalsyConditionStatement);
} else {
throw new Error(`Unknown or unsupported task type: ${plan.type}`);
}
@ -512,6 +540,11 @@ export class PageTaskExecutor {
return this.appendErrorPlan(taskExecutor, errorMsg);
}
if (replanCount > 0) {
// add a brief sleep to wait for the page to be ready
await sleep(300);
}
// plan
await taskExecutor.append(planningTask);
const planResult: PlanningAIResponse = await taskExecutor.flush();
@ -524,26 +557,6 @@ export class PageTaskExecutor {
const plans = planResult.actions;
// check if their is nothing but a locate will null task
// const validPlans = plans.filter((plan: PlanningAction) => {
// if (plan.type === 'Locate' && !plan.param?.id) {
// return false;
// }
// return plan.type !== 'Plan';
// });
// if (validPlans.length === 0) {
// if (replanCount === 0) {
// return this.appendErrorPlan(
// taskExecutor,
// `No valid plans found, cannot proceed: ${userPrompt}`,
// );
// }
// return this.appendErrorPlan(
// taskExecutor,
// `Cannot proceed after several steps, please check the report: ${userPrompt}`,
// );
// }
let executables: Awaited<ReturnType<typeof this.convertPlanToExecutable>>;
try {
executables = await this.convertPlanToExecutable(plans, cacheGroup);

View File

@ -1,10 +1,12 @@
import type {
ExecutionTask,
ExecutionTaskAction,
ExecutionTaskActionApply,
ExecutionTaskInsightAssertion,
ExecutionTaskInsightLocate,
ExecutionTaskInsightQuery,
ExecutionTaskPlanning,
PlanningActionParamScroll,
} from '@midscene/core';
export function typeStr(task: ExecutionTask) {
@ -27,8 +29,23 @@ export function paramStr(task: ExecutionTask) {
if (task.type === 'Action') {
const sleepMs = (task as ExecutionTaskAction)?.param?.timeMs;
const scrollType = (
task as ExecutionTask<ExecutionTaskActionApply<PlanningActionParamScroll>>
)?.param?.scrollType;
if (sleepMs) {
value = `${sleepMs}ms`;
} else if (scrollType) {
const scrollDirection = (
task as ExecutionTask<
ExecutionTaskActionApply<PlanningActionParamScroll>
>
)?.param?.direction;
const scrollDistance = (
task as ExecutionTask<
ExecutionTaskActionApply<PlanningActionParamScroll>
>
)?.param?.distance;
value = `${scrollDirection}, ${scrollType}, ${scrollDistance || 'distance-not-set'}`;
} else {
value =
(task as ExecutionTaskAction)?.param?.value ||

View File

@ -35,8 +35,12 @@ export abstract class AbstractPage {
abstract scrollUntilTop(): Promise<void>;
abstract scrollUntilBottom(): Promise<void>;
abstract scrollUpOneScreen(): Promise<void>;
abstract scrollDownOneScreen(): Promise<void>;
abstract scrollUntilLeft(): Promise<void>;
abstract scrollUntilRight(): Promise<void>;
abstract scrollUp(distance?: number): Promise<void>;
abstract scrollDown(distance?: number): Promise<void>;
abstract scrollLeft(distance?: number): Promise<void>;
abstract scrollRight(distance?: number): Promise<void>;
abstract _forceUsePageContext?(): Promise<WebUIContext>;

View File

@ -47,12 +47,28 @@ export default class StaticPage implements AbstractPage {
return ThrowNotImplemented('scrollUntilBottom');
}
async scrollUpOneScreen() {
return ThrowNotImplemented('scrollUpOneScreen');
async scrollUntilLeft() {
return ThrowNotImplemented('scrollUntilLeft');
}
async scrollDownOneScreen() {
return ThrowNotImplemented('scrollDownOneScreen');
async scrollUntilRight() {
return ThrowNotImplemented('scrollUntilRight');
}
// Not supported on a static page: delegates to ThrowNotImplemented with the method name.
// `distance` is accepted to match the page interface but is not used.
async scrollUp(distance?: number) {
return ThrowNotImplemented('scrollUp');
}
// Not supported on a static page: delegates to ThrowNotImplemented with the method name.
// `distance` is accepted to match the page interface but is not used.
async scrollDown(distance?: number) {
return ThrowNotImplemented('scrollDown');
}
// Not supported on a static page: delegates to ThrowNotImplemented with the method name.
// `distance` is accepted to match the page interface but is not used.
async scrollLeft(distance?: number) {
return ThrowNotImplemented('scrollLeft');
}
// Not supported on a static page: delegates to ThrowNotImplemented with the method name.
// `distance` is accepted to match the page interface but is not used.
async scrollRight(distance?: number) {
return ThrowNotImplemented('scrollRight');
}
async clearInput() {

View File

@ -136,19 +136,38 @@ export class Page<
scrollUntilTop(): Promise<void> {
return this.mouse.wheel(0, -9999999);
}
scrollUntilBottom(): Promise<void> {
return this.mouse.wheel(0, 9999999);
}
async scrollUpOneScreen(): Promise<void> {
const innerHeight = await this.evaluate(() => window.innerHeight);
const distance = innerHeight * 0.7;
await this.mouse.wheel(0, -distance);
scrollUntilLeft(): Promise<void> {
return this.mouse.wheel(-9999999, 0);
}
async scrollDownOneScreen(): Promise<void> {
scrollUntilRight(): Promise<void> {
return this.mouse.wheel(9999999, 0);
}
async scrollUp(distance?: number): Promise<void> {
const innerHeight = await this.evaluate(() => window.innerHeight);
const distance = innerHeight * 0.7;
await this.mouse.wheel(0, distance);
const scrollDistance = distance || innerHeight * 0.7;
await this.mouse.wheel(0, -scrollDistance);
}
/**
 * Scrolls down with a single mouse-wheel event.
 * A missing (or zero) `distance` falls back to 70% of `window.innerHeight`.
 */
async scrollDown(distance?: number): Promise<void> {
  const viewportHeight = await this.evaluate(() => window.innerHeight);
  const wheelDelta = distance || viewportHeight * 0.7;
  await this.mouse.wheel(0, wheelDelta);
}
/**
 * Scrolls left with a single mouse-wheel event.
 * A missing (or zero) `distance` falls back to 70% of `window.innerWidth`.
 */
async scrollLeft(distance?: number): Promise<void> {
  const viewportWidth = await this.evaluate(() => window.innerWidth);
  const wheelDelta = distance || viewportWidth * 0.7;
  await this.mouse.wheel(-wheelDelta, 0);
}
/**
 * Scrolls right with a single mouse-wheel event.
 * A missing (or zero) `distance` falls back to 70% of `window.innerWidth`.
 */
async scrollRight(distance?: number): Promise<void> {
  const viewportWidth = await this.evaluate(() => window.innerWidth);
  const wheelDelta = distance || viewportWidth * 0.7;
  await this.mouse.wheel(wheelDelta, 0);
}
async destroy(): Promise<void> {

View File

@ -0,0 +1,36 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scroll Page Demo</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 20px;
}
</style>
</head>
<body>
<!-- Fixture page with two independently scrollable containers. -->
<!-- Vertical container: 300px tall, holding five stacked 100px blocks (500px of content). -->
<h1>Vertical Scroll</h1>
<div style="width: 300px; height: 300px; background-color: #ccc; overflow-y: scroll;">
<div style="width: 100px; height: 100px; background-color: #aaa;">Vertical 1</div>
<div style="width: 100px; height: 100px; background-color: #bbb;">Vertical 2</div>
<div style="width: 100px; height: 100px; background-color: #ccc;">Vertical 3</div>
<div style="width: 100px; height: 100px; background-color: #ddd;">Vertical 4</div>
<div style="width: 100px; height: 100px; background-color: #eee;">Vertical 5</div>
</div>
<!-- Horizontal container: 300px wide, holding five inline 100px blocks kept on one line by nowrap. -->
<h1>Horizontal Scroll</h1>
<div style="width: 300px; height: 100px; background-color: #ccc; overflow-x: scroll; white-space: nowrap;">
<div style="width: 100px; height: 100px; background-color: #aaa; display: inline-block;">Horizontal 1</div>
<div style="width: 100px; height: 100px; background-color: #bbb; display: inline-block;">Horizontal 2</div>
<div style="width: 100px; height: 100px; background-color: #ccc; display: inline-block;">Horizontal 3</div>
<div style="width: 100px; height: 100px; background-color: #ddd; display: inline-block;">Horizontal 4</div>
<div style="width: 100px; height: 100px; background-color: #eee; display: inline-block;">Horizontal 5</div>
</div>
</body>
</html>

View File

@ -1,3 +1,4 @@
import path from 'node:path';
import { pathToFileURL } from 'node:url';
import { PuppeteerAgent } from '@/puppeteer';
import { describe, expect, it, vi } from 'vitest';
import { launchPage } from './utils';
@ -84,7 +85,7 @@ describe(
await reset();
});
it('Search', async () => {
it('search engine', async () => {
const { originPage, reset } = await launchPage('https://www.baidu.com/');
const mid = new PuppeteerAgent(originPage);
await mid.aiAction(
@ -95,6 +96,31 @@ describe(
await reset();
});
it('scroll', async () => {
  // Build the fixture URL with pathToFileURL rather than string concatenation:
  // `file://${path}` is not a valid URL on Windows (drive letters, backslashes)
  // and does not percent-encode characters such as spaces.
  const htmlPath = path.join(__dirname, 'scroll.html');
  const fixtureUrl = pathToFileURL(htmlPath).href;
  const { originPage, reset } = await launchPage(fixtureUrl);
  const mid = new PuppeteerAgent(originPage);

  // Scroll inside each container of the fixture page.
  await mid.aiAction(
    'find the "Vertical 2" element, scroll down 200px, find the "Horizontal 2" element, scroll right 100px',
  );
  await mid.aiAssert(
    'the "Horizontal 2", "Horizontal 4" and "Vertical 5" elements are visible',
  );

  await reset();
});
// Scratch test, skipped via `it.skip`: drives a live search page with a combined
// instruction that ends in an "if" statement (closing a cookie prompt if present).
it.skip('Playground', async () => {
const { originPage, reset } = await launchPage('https://www.baidu.com/');
const mid = new PuppeteerAgent(originPage);
// await mid.aiAction('Close the cookie prompt');
await mid.aiAction(
'Type "AI 101" in search box, hit Enter, wait 2s. If there is a cookie prompt, close it',
);
await reset();
});
},
{
timeout: 180 * 1000,

View File

@ -118,7 +118,7 @@ describe(
height: 200,
},
});
await page.scrollDownOneScreen();
await page.scrollDown();
await new Promise((resolve) => setTimeout(resolve, 1000));
await generateExtractData(
page,