fix: over planning for Qwen in page with form (#429)

This commit is contained in:
yuyutaotao 2025-02-27 16:44:01 +08:00 committed by GitHub
parent 881adab4fc
commit a6ffbd07b7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 60486 additions and 69 deletions

View File

@ -83,7 +83,7 @@ import { AgentOverChromeBridge } from "@midscene/web/bridge-mode";
const agent = new AgentOverChromeBridge();
```
Except [the normal parameters in the agent constructor](./api), `AgentOverChromeBridge` provides one more parameter:
In addition to [the normal parameters in the agent constructor](./api), `AgentOverChromeBridge` accepts one more parameter:
* `closeNewTabsAfterDisconnect?: boolean`: If true, the newly created tab will be closed when the bridge is destroyed. Default is false.

View File

@ -32,7 +32,35 @@ Good ✅: Split the task into three steps:
"Click Sign up button"
"Fill the form with 'test@test.com' in the email field, 'test' in the password field, and click Sign up button"
## LLMs can NOT tell the exact number like coords or hex-style color, give it some choices
### Understand the reason why AI is wrong, and optimize the prompt
This prompt may cause the click to fail:
⚠️ Click the "include" in the "range" dropdown menu
After checking the report, you will find that the AI may tend to open the floating layer first, and then find the "include" option. If the floating layer is already open, you can try:
✅ The floating layer is open, please click the "include" option
Another example:
This may fail when there are many "Add" buttons on the page, or the button is an icon button:
⚠️ Click the "Add" button
You can try:
✅ Click the "Add" button on the top-right corner, it's a button with a "+" icon, on the right side of the "range" dropdown menu
If the button is too large, the AI may misjudge the clickable range:
⚠️ Click the "User Register" menu
You can try:
✅ Click the "User Register" text in the left menu
### LLMs can NOT tell the exact number like coords or hex-style color, give it some choices
For example:

View File

@ -83,7 +83,7 @@ import { AgentOverChromeBridge } from "@midscene/web/bridge-mode";
const agent = new AgentOverChromeBridge();
```
除了 [普通 Agent 构造器](./api) 的参数,`AgentOverChromeBridge` 还提供了以下参数:
除了 [普通 Agent 构造器](./api) 的参数,`AgentOverChromeBridge` 还接受以下参数:
* `closeNewTabsAfterDisconnect?: boolean`: 如果为 true,当桥接断开时所有新创建的标签页都将被自动关闭。默认值为 false。

View File

@ -31,6 +31,34 @@
"点击注册按钮"
"在表单中输入'test@test.com'作为邮箱,'test'作为密码,然后点击注册按钮"
### 理解 AI 出错的原因,调优指令
这条指令可能会导致点击失败:
⚠️ 点击 Range 浮层中的 "include" 选项
查看运行报告后,你会发现 AI 可能倾向于先去打开浮层、然后再寻找 include 选项。如果此时浮层已经打开,可以尝试:
✅ 浮层已经展开,请点击 "include" 选项
另一个例子:
这条指令可能会在有很多 "Add" 按钮的页面中失败,或者按钮是一个图标按钮、缺少文本时:
⚠️ 点击 "Add" 按钮
你可以尝试:
✅ 点击页面右上角的 "Add" 按钮,它是一个带有 "+" 图标的按钮,位于 "range" 下拉菜单的右侧
如果按钮尺寸太大,AI 可能误判按钮的可点击范围:
⚠️ 点击 "用户注册" 菜单
你可以尝试:
✅ 点击左侧菜单中的 "用户注册" 文字
### LLM 无法准确辨别数值(比如坐标或十六进制颜色值),不妨提供一些选项
例如:

View File

@ -26,9 +26,11 @@ OPENAI_API_KEY="sk-replace-by-your-own"
## 开始体验
配置完成后,你可以立即开始使用 Midscene。你可以通过 Action 与网页进行交互,利用 Query 提取 JSON 数据,或通过 Assert 进行断言。
配置完成后,你可以立即开始使用 Midscene。它一共有三个关键操作 Tab:
同时,你会发现插件会提供操作回放功能以及一份报告文件供查看。这份报告与你从自动化脚本所得的报告是相同的。
- **Action**: 与网页进行交互,如 "在搜索框中输入 Midscene" 或 "点击登录按钮"。
- **Query**: 从界面中提取 JSON 数据,如 "提取页面中的用户 ID,返回 \{ id: string \}"。
- **Assert**: 执行断言,如 "页面标题是 Midscene"。
快来试试吧!

View File

@ -95,6 +95,18 @@ test('antd widget - carousel', async ({ page }) => {
);
});
// Generates test data from the Ant Design form "register" demo page.
// NOTE(review): generateExtractData / generateTestDataPath are defined outside
// this view — presumably they dump the page state under the 'antd-form' test
// data directory; confirm against their definitions.
test('antd widget - form', async ({ page }) => {
const playwrightPage = new PlaywrightWebPage(page);
// Use a fixed viewport size, set before navigating to the page.
await page.setViewportSize({ width: 1400, height: 1080 });
await page.goto(
'https://ant-design.antgroup.com/components/form-cn#form-demo-register',
);
// Wait for the 'networkidle' load state before extracting the data.
await page.waitForLoadState('networkidle');
await generateExtractData(playwrightPage, generateTestDataPath('antd-form'));
});
test('heytea online order', async ({ page, ai }) => {
const playwrightPage = new PlaywrightWebPage(page);

View File

@ -0,0 +1,41 @@
{
"testDataPath": "antd-form",
"testCases": [
{
"prompt": "首先,在 Email 输入框中输入'test@test.com'",
"response_planning": {
"action": {
"type": "Input",
"locate": {
"prompt": "Email input field",
"bbox": [563, 214, 965, 240]
},
"param": {
"value": "test@test.com"
}
},
"log": "在 Email 输入框中输入 'test@test.com'",
"more_actions_needed_by_instruction": false,
"finish": true,
"actions": [
{
"type": "Input",
"locate": {
"prompt": "Email input field",
"bbox": [563, 214, 965, 240]
},
"param": {
"value": "test@test.com"
}
}
],
"rawResponse": "{\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 563,\n 214,\n 965,\n 240\n ],\n \"prompt\": \"Email input field\"\n },\n \"param\": {\n \"value\": \"test@test.com\"\n }\n },\n \"log\": \"在 Email 输入框中输入 'test@test.com'\",\n \"more_actions_needed_by_instruction\": false\n}",
"usage": {
"prompt_tokens": 2541,
"completion_tokens": 105,
"total_tokens": 2646
}
}
}
]
}

View File

@ -0,0 +1,41 @@
{
"testDataPath": "aweme-login",
"testCases": [
{
"prompt": "type 'user' in the username input box, type '123456' in the password input box",
"log": "type 'user' in the username input box",
"response_planning": {
"action": {
"type": "Input",
"locate": {
"prompt": "password input box",
"bbox": [493, 417, 786, 465]
},
"param": {
"value": "123456"
}
},
"log": "type '123456' in the password input box",
"finish": true,
"actions": [
{
"type": "Input",
"locate": {
"prompt": "password input box",
"bbox": [493, 417, 786, 465]
},
"param": {
"value": "123456"
}
}
],
"rawResponse": "{\n \"action\": {\n \"type\": \"Input\",\n \"locate\": {\n \"bbox_2d\": [\n 493,\n 417,\n 786,\n 465\n ],\n \"prompt\": \"password input box\"\n },\n \"param\": {\n \"value\": \"123456\"\n }\n },\n \"log\": \"type '123456' in the password input box\",\n \"finish\": true\n}",
"usage": {
"prompt_tokens": 1971,
"completion_tokens": 108,
"total_tokens": 2079
}
}
}
]
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,473 @@
<>
<a id="hdoie" markerId="0" draggable="false" alt="logo" left="40" top="16" width="32" height="32">
</a>
<>
<span id="mfigm" markerId="1" left="84" top="19" width="93" height="26">
Ant Design
</span>
</>
</>
<>
<div id="hdakf" markerId="2" svgContent="true" left="309" top="26" width="13" height="13">
</div>
<input id="adjbb" markerId="3" class=".dumi-default-search-bar-input" placeholder="输入关键字搜索..." undefined="" left="293" top="21" width="280" height="22">
输入关键字搜索...
</input>
<>
<span id="cejmn" markerId="4" left="582" top="25" width="11" height="14">
</span>
<span id="loebp" markerId="5" left="593" top="25" width="11" height="14">
K
</span>
</>
</>
<>
<a id="jncdi" markerId="6" left="768" top="23" width="28" height="19">
设计
</a>
</>
<>
<a id="dmikg" markerId="7" left="832" top="23" width="28" height="19">
研发
</a>
</>
<>
<a id="mgalb" markerId="8" left="896" top="23" width="28" height="19">
组件
</a>
</>
<>
<a id="ffehn" markerId="9" left="960" top="23" width="28" height="19">
博客
</a>
</>
<>
<a id="bbjng" markerId="10" left="1024" top="23" width="28" height="19">
资源
</a>
</>
<>
<a id="pagch" markerId="11" left="1082" top="23" width="56" height="19">
国内镜像
</a>
</>
<>
<span id="ihlfa" markerId="12" left="1170" top="23" width="40" height="17">
5.24.2
</span>
</>
<div id="dinoj" markerId="13" type="button" class=".acss-a4e8ca" aria-describedby=":Rjlblj5cma:" left="1264" top="16" width="32" height="32">
En
</div>
<div id="nfpha" markerId="14" type="button" class=".acss-a4e8ca" aria-label="RTL Switch Button" aria-describedby=":Rklblj5cma:" left="1308" top="16" width="32" height="32">
</div>
<>
<a id="hmbld" markerId="15" type="button" class=".acss-a4e8ca" aria-describedby=":Rllblj5cma:" left="1352" top="16" width="32" height="32">
</a>
</>
<>
<span id="cgjbn" markerId="16" left="44" top="119" width="56" height="19">
组件总览
</span>
</>
<>
<div id="dbfhp" markerId="17" left="44" top="174" width="28" height="19">
通用
</div>
</>
<>
<span id="hdpka" markerId="18" left="44" top="230" width="42" height="19">
Button
</span>
</>
<>
<span id="injck" markerId="19" left="94" top="231" width="24" height="17">
按钮
</span>
</>
<>
<span id="blbpf" markerId="20" left="44" top="274" width="73" height="19">
FloatButton
</span>
</>
<>
<span id="pjlpj" markerId="21" left="125" top="275" width="48" height="17">
悬浮按钮
</span>
</>
<>
<span id="loofk" markerId="22" left="235" top="276" width="28" height="14">
5.0.0
</span>
</>
<>
<span id="lgmbn" markerId="23" left="44" top="318" width="27" height="19">
Icon
</span>
</>
<>
<span id="npcno" markerId="24" left="79" top="319" width="24" height="17">
图标
</span>
</>
<>
<span id="jhmgb" markerId="25" left="44" top="362" width="74" height="19">
Typography
</span>
</>
<>
<span id="bfjin" markerId="26" left="126" top="363" width="24" height="17">
排版
</span>
</>
<>
<div id="knnlf" markerId="27" left="44" top="417" width="28" height="19">
布局
</div>
</>
<>
<span id="milel" markerId="28" left="44" top="473" width="45" height="19">
Divider
</span>
</>
<>
<span id="haena" markerId="29" left="97" top="474" width="36" height="17">
分割线
</span>
</>
<>
<span id="emgpf" markerId="30" left="44" top="517" width="26" height="19">
Flex
</span>
</>
<>
<span id="bmoal" markerId="31" left="78" top="518" width="48" height="17">
弹性布局
</span>
</>
<>
<span id="kmnlc" markerId="32" left="229" top="519" width="34" height="14">
5.10.0
</span>
</>
<>
<span id="ljafl" markerId="33" left="44" top="561" width="27" height="19">
Grid
</span>
</>
<>
<span id="chihb" markerId="34" left="79" top="562" width="24" height="17">
栅格
</span>
</>
<>
<span id="fkpgm" markerId="35" left="44" top="605" width="42" height="19">
Layout
</span>
</>
<>
<span id="jbpeh" markerId="36" left="94" top="606" width="24" height="17">
布局
</span>
</>
<>
<span id="kpmmp" markerId="37" left="44" top="649" width="38" height="19">
Space
</span>
</>
<>
<span id="ioddf" markerId="38" left="90" top="650" width="24" height="17">
间距
</span>
</>
<>
<span id="bjjnc" markerId="39" left="44" top="693" width="45" height="19">
Splitter
</span>
</>
<>
<span id="kfneg" markerId="40" left="97" top="694" width="48" height="17">
分隔面板
</span>
</>
<>
<span id="ocapn" markerId="41" left="229" top="695" width="34" height="14">
5.21.0
</span>
</>
<>
<div id="lfpfh" markerId="42" left="44" top="748" width="28" height="19">
导航
</div>
</>
<>
<span id="bgkfb" markerId="43" left="44" top="804" width="45" height="19">
Anchor
</span>
</>
<>
<span id="ckjfc" markerId="44" left="97" top="805" width="24" height="17">
锚点
</span>
</>
<>
<span id="kjgcd" markerId="45" left="44" top="848" width="76" height="19">
Breadcrumb
</span>
</>
<>
<span id="lidno" markerId="46" left="128" top="849" width="36" height="17">
面包屑
</span>
</>
<>
<span id="hffmc" markerId="47" left="44" top="892" width="66" height="19">
Dropdown
</span>
</>
<>
<span id="neadm" markerId="48" left="118" top="893" width="48" height="17">
下拉菜单
</span>
</>
<>
<span id="pglgg" markerId="49" left="44" top="936" width="36" height="19">
Menu
</span>
</>
<>
<span id="elpjb" markerId="50" left="88" top="937" width="48" height="17">
导航菜单
</span>
</>
<>
<span id="mjlej" markerId="51" left="44" top="980" width="66" height="19">
Pagination
</span>
</>
<>
<span id="fmegh" markerId="52" left="118" top="981" width="24" height="17">
分页
</span>
</>
<>
<span id="nankd" markerId="53" left="44" top="1024" width="35" height="19">
Steps
</span>
</>
<>
<span id="jhhje" markerId="54" left="87" top="1025" width="36" height="17">
步骤条
</span>
</>
<>
<p id="gohgg" markerId="55" left="379" top="69" width="198" height="15">
🛎️ 想要 3 分钟实现登录表单?试试
</p>
<>
<a id="mibhe" markerId="56" left="577" top="69" width="99" height="15">
Pro Components
</a>
</>
<p id="bfoln" markerId="57" left="675" top="69" width="13" height="15">
</p>
</>
<>
<span id="nlhkj" markerId="58" svgContent="true" left="717" top="123" width="14" height="16">
</span>
</>
<>
<span id="hbnjn" markerId="59" svgContent="true" left="748" top="123" width="13" height="16">
</span>
</>
<>
<span id="fmipb" markerId="60" svgContent="true" left="780" top="124" width="14" height="14">
</span>
</>
<>
<span id="ohcjm" markerId="61" svgContent="true" left="814" top="126" width="11" height="11">
</span>
</>
<>
<div id="knmbn" markerId="62" alt="expand code" class=".code-expand-icon-show" left="844" top="123" width="16" height="16">
</div>
</>
<>
<label id="jmjcl" markerId="63" left="510" top="219" width="41" height="17">
E-mail
</label>
</>
<>
<input id="gjhcl" markerId="64" id="register_email" aria-required="true" class=".ant-input.ant-input-outlined.css-var-R3albtj5cma.ant-input-css-var" type="text" undefined="" left="565" top="211" width="400" height="32">
</input>
</>
<>
<label id="ckbcc" markerId="65" left="489" top="275" width="62" height="17">
Password
</label>
</>
<>
<input id="pakoh" markerId="66" id="register_password" aria-required="true" class=".ant-input" type="password" undefined="" left="577" top="272" width="358" height="22">
</input>
<>
<span id="diahb" markerId="67" svgContent="true" left="939" top="277" width="14" height="12">
</span>
</>
</>
<>
<label id="iejcf" markerId="68" left="433" top="331" width="117" height="17">
Confirm Password
</label>
</>
<>
<input id="pagka" markerId="69" id="register_confirm" aria-required="true" class=".ant-input" type="password" undefined="" left="577" top="328" width="358" height="22">
</input>
<>
<span id="odibg" markerId="70" svgContent="true" left="939" top="333" width="14" height="12">
</span>
</>
</>
<>
<label id="feiae" markerId="71" left="469" top="387" width="64" height="17">
Nickname
</label>
<>
<span id="kcing" markerId="72" svgContent="true" left="537" top="388" width="14" height="14">
</span>
</>
</>
<>
<input id="odmce" markerId="73" id="register_nickname" aria-required="true" class=".ant-input.ant-input-outlined.css-var-R3albtj5cma.ant-input-css-var" type="text" undefined="" left="565" top="379" width="400" height="32">
</input>
</>
<>
<label id="pklok" markerId="74" left="426" top="443" width="124" height="17">
Habitual Residence
</label>
</>
<>
<span id="alafb" markerId="75" left="577" top="443" width="212" height="17">
Zhejiang / Hangzhou / West Lake
</span>
</>
<>
<span id="kgmcm" markerId="76" svgContent="true" left="942" top="445" width="12" height="12">
</span>
</>
<>
<label id="amkno" markerId="77" left="454" top="499" width="96" height="17">
Phone Number
</label>
</>
<>
<span id="mmhgl" markerId="78" left="579" top="499" width="26" height="17">
+86
</span>
</>
<input id="cfbdf" markerId="79" id="register_phone" aria-required="true" class=".ant-input.ant-input-outlined" type="text" undefined="" left="636" top="491" width="329" height="32">
</input>
<>
<label id="kocgd" markerId="80" left="493" top="555" width="58" height="17">
Donation
</label>
</>
<>
<input id="ohijk" markerId="81" autocomplete="off" role="spinbutton" step="1" id="register_donation" aria-required="true" class=".ant-input-number-input" undefined="" left="566" top="548" width="327" height="30">
</input>
</>
<>
<input id="pgegh" markerId="82" id="register_suffix" autocomplete="off" class=".ant-select-selection-search-input" role="combobox" aria-expanded="false" aria-haspopup="listbox" aria-owns="register_suffix_list" aria-autocomplete="list" aria-controls="register_suffix_list" undefined="" unselectable="on" type="search" left="906" top="548" width="28" height="30">
</input>
</>
<>
<label id="manhi" markerId="83" left="498" top="611" width="52" height="17">
Website
</label>
</>
<>
<input id="nkdbd" markerId="84" autocomplete="off" id="register_website" role="combobox" aria-expanded="false" aria-haspopup="listbox" aria-owns="register_website_list" aria-autocomplete="list" aria-controls="register_website_list" aria-required="true" class=".ant-input.ant-input-outlined.ant-select-selection-search-input.css-var-R3albtj5cma.ant-input-css-var" type="search" undefined="" left="565" top="603" width="400" height="32">
</input>
</>
<>
<label id="fpklg" markerId="85" left="521" top="667" width="30" height="17">
Intro
</label>
</>
<>
<textarea id="aheoi" markerId="86" id="register_intro" aria-required="true" maxlength="100" class=".ant-input" undefined="" left="566" top="660" width="398" height="52">
</textarea>
<>
<span id="lkofc" markerId="87" left="920" top="715" width="44" height="17">
0 / 100
</span>
</>
</>
<>
<label id="kihpb" markerId="88" left="503" top="745" width="48" height="17">
Gender
</label>
</>
<>
<input id="nopbg" markerId="89" id="register_gender" autocomplete="off" class=".ant-select-selection-search-input" role="combobox" aria-expanded="false" aria-haspopup="listbox" aria-owns="register_gender_list" aria-autocomplete="list" aria-controls="register_gender_list" aria-required="true" undefined="" unselectable="on" type="search" left="577" top="738" width="358" height="30">
</input>
</>
<>
<label id="cpdgp" markerId="90" left="497" top="801" width="54" height="17">
Captcha
</label>
</>
<>
<input id="pgkmp" markerId="91" id="register_captcha" aria-required="true" class=".ant-input.ant-input-outlined.css-var-R3albtj5cma.ant-input-css-var" type="text" undefined="" left="565" top="793" width="196" height="32">
</input>
</>
<>
<div id="fdgje" markerId="92" type="button" class=".ant-btn.css-var-R3albtj5cma.ant-btn-default.ant-btn-color-default.ant-btn-variant-outlined" left="769" top="793" width="111" height="32">
Get captcha
</div>
</>
<>
<div id="kolii" markerId="93" left="565" top="828" width="275" height="17">
We must make sure that your are a human.
</div>
</>
<>
<input id="hjlfo" markerId="94" id="register_agreement" class=".ant-checkbox-input" type="checkbox" undefined="" left="565" top="881" width="14" height="14">
on
</input>
</>
<>
<span id="nailo" markerId="95" left="589" top="881" width="99" height="17">
I have read the
</span>
<>
<a id="mgdmi" markerId="96" left="688" top="881" width="69" height="17">
agreement
</a>
</>
</>
<>
<div id="fjkmi" markerId="97" type="submit" class=".ant-btn.css-var-R3albtj5cma.ant-btn-primary.ant-btn-color-primary.ant-btn-variant-solid" left="565" top="929" width="85" height="32">
Register
</div>
</>
<>
<a id="koicf" markerId="98" left="365" top="1029" width="70" height="17">
注册新用户
</a>
</>
<>
<span id="noeih" markerId="99" svgContent="true" left="444" top="1029" width="14" height="14">
</span>
</>
<>
<p id="fomgn" markerId="100" left="365" top="1060" width="224" height="17">
用户填写必须的信息以注册新用户。
</p>
</>
<>
<div id="hlleb" markerId="101" aria-label="Theme Switcher" class=".css-var-R2a.ant-float-btn-css-var.ant-float-btn.ant-float-btn-group-trigger.ant-float-btn-default.ant-float-btn-circle" type="button" left="1336" top="992" width="40" height="40">
</div>
</>

Binary file not shown.

After

Width:  |  Height:  |  Size: 281 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 482 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 358 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 200 KiB

View File

@ -69,9 +69,7 @@ describe('ai inspect element', () => {
);
}
},
{
timeout: 3 * 60 * 1000,
},
3 * 60 * 1000,
);
});
});

View File

@ -74,14 +74,12 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
await resultCollector.analyze(failCaseThreshold);
await sleep(3 * 1000);
},
{
timeout: 240 * 1000,
},
240 * 1000,
);
});
});
const vlCases = ['todo-vl'];
const vlCases = ['todo-vl', 'aweme-login-vl', 'antd-form-vl'];
describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
vlCases.forEach((source) => {
@ -125,9 +123,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
await resultCollector.analyze(failCaseThreshold);
await sleep(3 * 1000);
},
{
timeout: 240 * 1000,
},
240 * 1000,
);
});
});

View File

@ -33,7 +33,7 @@
"build:watch": "modern build -w",
"new": "modern new",
"upgrade": "modern upgrade",
"test": "vitest --run -u",
"test": "vitest --run",
"test:ai": "AITEST=true npm run test",
"computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
"test:parse-action": "npm run test:ai -- tests/ai/parse-action.test.ts",

View File

@ -89,7 +89,9 @@ export async function plan(
assert(planFromAI, "can't get plans from AI");
assert(
actions.length > 0 || returnValue.finish || returnValue.sleep,
actions.length > 0 ||
!returnValue.more_actions_needed_by_instruction ||
returnValue.sleep,
`Failed to plan actions: ${planFromAI.error || '(no error details)'}`,
);

View File

@ -3,7 +3,7 @@ import { getTimeZoneInfo } from './ui-tars-planning';
export const language = getTimeZoneInfo().isChina ? 'Chinese' : 'English';
const defaultAssertionPrompt =
'You are a senior testing engineer. User will give an assertion and a screenshot of a page. Please tell whether the assertion is truthy.';
'You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.';
const defaultAssertionResponseJsonFormat = `Return in the following JSON format:
{

View File

@ -8,16 +8,18 @@ import type { ResponseFormatJSONSchema } from 'openai/resources';
import { samplePageDescription } from './util';
// Note: put the log field first to trigger the CoT
const commonOutputFields = `"log": string, // Log what the action(s) do. Use the same language as the user's instruction.
"finish": boolean, // If all the actions described in the instruction have been covered by this action and logs, set this field to true.
// Output fields shared by the planning prompts.
// `more_actions_needed_by_instruction` replaces the former `finish` flag with
// inverted polarity: `false` means the instruction is fully covered, `true`
// means at least one more planning round is required. The description given to
// the model must therefore ask for `false` when everything has been covered.
const commonOutputFields = `"log": string, // Log what this action(s) you just planned do. Use the same language as the user's instruction.
"more_actions_needed_by_instruction": boolean, // If all the actions described in the instruction have been covered by this action and logs, set this field to false. If there are still more actions to do after this one, set this field to true.
"error"?: string // Error messages about unexpected situations, if any. Use the same language as the user's instruction.`;
const qwenLocateParam =
'locate: {bbox_2d: [number, number, number, number], prompt: string }';
const systemTemplateOfQwen = `
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the NEXT action is to finish the instruction.
Don't give extra actions beyond the instruction. Don't repeat actions in the previous logs.
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the NEXT action is to do the tasks the instruction requires.
Restriction:
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
- Don't repeat actions in the previous logs.
Supporting actions:
- Tap: { type: "Tap", ${qwenLocateParam} }
@ -27,7 +29,8 @@ Supporting actions:
- Scroll: { type: "Scroll", ${qwenLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
- ExpectedFalsyCondition: { type: "ExpectedFalsyCondition", param: {reason: string} } // Use this action when the conditional statement talked about in the instruction is falsy.
The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
Field description:
* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
Return in JSON format:
{

View File

@ -267,7 +267,7 @@ export interface PlanningAction<ParamType = any> {
export interface PlanningAIResponse {
action?: PlanningAction; // this is the qwen mode
actions?: PlanningAction[];
finish: boolean;
more_actions_needed_by_instruction: boolean;
log: string;
sleep?: number;
error?: string;

View File

@ -118,7 +118,7 @@ describe('planning', () => {
{ context },
);
expect(res.finish).toBeFalsy();
expect(res.more_actions_needed_by_instruction).toBeTruthy();
expect(res.log).toBeDefined();
});
});

View File

@ -0,0 +1,28 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`Assertion prompt > return UI-Tars specific when it is UI-Tars 1`] = `
"You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.
## Output Json String Format
\`\`\`
"{
"pass": <<is a boolean value from the enum [true, false], true means the assertion is truthy>>,
"thought": "<<is a string, give the reason why the assertion is falsy or truthy. Otherwise.>>"
}"
\`\`\`
## Rules **MUST** follow
- Make sure to return **only** the JSON, with **no additional** text or explanations.
- Use English in \`thought\` part.
- You **MUST** strictly follow up the **Output Json String Format**."
`;
exports[`Assertion prompt > return default when it is not UI-Tars 1`] = `
"You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.
Return in the following JSON format:
{
pass: boolean, // whether the assertion is truthy
thought: string | null, // string, if the result is falsy, give the reason why it is falsy. Otherwise, put null.
}"
`;

View File

@ -234,8 +234,8 @@ The JSON format is as follows:
"actions": [
// ... some actions
],
"log": string, // Log what the action(s) do. Use the same language as the user's instruction.
"finish": boolean, // If all the actions described in the instruction have been covered by this action and logs, set this field to true.
"log": string, // Log what this action(s) you just planned do. Use the same language as the user's instruction.
"more_actions_needed_by_instruction": boolean, // If all the actions described in the instruction have been covered by this action and logs, set this field to true.
"error"?: string // Error messages about unexpected situations, if any. Use the same language as the user's instruction.
}
@ -318,8 +318,10 @@ THIS IS WHAT HAS BEEN DONE
exports[`system prompt to task planning > planning - qwen 1`] = `
"
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the NEXT action is to finish the instruction.
Don't give extra actions beyond the instruction. Don't repeat actions in the previous logs.
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the NEXT action is to do the tasks the instruction requires.
Restriction:
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
- Don't repeat actions in the previous logs.
Supporting actions:
- Tap: { type: "Tap", locate: {bbox_2d: [number, number, number, number], prompt: string } }
@ -329,7 +331,8 @@ Supporting actions:
- Scroll: { type: "Scroll", locate: {bbox_2d: [number, number, number, number], prompt: string } | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
- ExpectedFalsyCondition: { type: "ExpectedFalsyCondition", param: {reason: string} } // Use this action when the conditional statement talked about in the instruction is falsy.
The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
Field description:
* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
Return in JSON format:
{
@ -339,8 +342,8 @@ Return in JSON format:
},
,
"sleep"?: number, // The sleep time after the action, in milliseconds.
"log": string, // Log what the action(s) do. Use the same language as the user's instruction.
"finish": boolean, // If all the actions described in the instruction have been covered by this action and logs, set this field to true.
"log": string, // Log what this action(s) you just planned do. Use the same language as the user's instruction.
"more_actions_needed_by_instruction": boolean, // If all the actions described in the instruction have been covered by this action and logs, set this field to true.
"error"?: string // Error messages about unexpected situations, if any. Use the same language as the user's instruction.
}
"

View File

@ -4,15 +4,7 @@ import { describe, expect, it, vi } from 'vitest';
describe('Assertion prompt', () => {
it('return default when it is not UI-Tars', () => {
const prompt = systemPromptToAssert({ isUITars: false });
expect(
prompt,
).toEqual(`You are a senior testing engineer. User will give an assertion and a screenshot of a page. Please tell whether the assertion is truthy.
Return in the following JSON format:
{
pass: boolean, // whether the assertion is truthy
thought: string | null, // string, if the result is falsy, give the reason why it is falsy. Otherwise, put null.
}`);
expect(prompt).toMatchSnapshot();
});
it('return UI-Tars specific when it is UI-Tars', () => {
@ -22,21 +14,6 @@ Return in the following JSON format:
const prompt = systemPromptToAssert({ isUITars: true });
expect(
prompt,
).toEqual(`You are a senior testing engineer. User will give an assertion and a screenshot of a page. Please tell whether the assertion is truthy.
## Output Json String Format
\`\`\`
"{
"pass": <<is a boolean value from the enum [true, false], true means the assertion is truthy>>,
"thought": "<<is a string, give the reason why the assertion is falsy or truthy. Otherwise.>>"
}"
\`\`\`
## Rules **MUST** follow
- Make sure to return **only** the JSON, with **no additional** text or explanations.
- Use English in \`thought\` part.
- You **MUST** strictly follow up the **Output Json String Format**.`);
expect(prompt).toMatchSnapshot();
});
});

View File

@ -412,16 +412,20 @@ const DetailSide = (): JSX.Element => {
});
}
if (typeof (task as ExecutionTaskPlanning).output?.finish === 'boolean') {
if (
typeof (task as ExecutionTaskPlanning).output
?.more_actions_needed_by_instruction === 'boolean'
) {
timelineData.push({
color: '#06B1AB',
children: (
<>
<p>
<b>If finished</b>
<b>More actions needed</b>
</p>
<p>
{(task as ExecutionTaskPlanning).output?.finish
{(task as ExecutionTaskPlanning).output
?.more_actions_needed_by_instruction
? 'true'
: 'false'}
</p>

View File

@ -111,9 +111,7 @@
"e2e:report": "MIDSCENE_REPORT=true playwright test --config=playwright.config.ts",
"e2e:cache": "MIDSCENE_CACHE=true playwright test --config=playwright.config.ts",
"e2e:ui": "playwright test --config=playwright.config.ts --ui",
"e2e:ui:cache": "MIDSCENE_CACHE=true playwright test --config=playwright.config.ts --ui",
"e2e:generate-test-data": "GENERATE_TEST_DATA=true playwright test ./tests/ai/web/playwright/generate-test-data.spec.ts",
"e2e:generate-test-data:headed": "GENERATE_TEST_DATA=true playwright test ./tests/ai/web/playwright/generate-test-data.spec.ts --headed"
"e2e:ui:cache": "MIDSCENE_CACHE=true playwright test --config=playwright.config.ts --ui"
},
"files": ["static", "dist", "iife-script", "README.md", "bin"],
"dependencies": {

View File

@ -534,8 +534,15 @@ export class PageTaskExecutor {
}
// console.log('planResult is', planResult);
const { actions, log, finish, error, usage, rawResponse, sleep } =
planResult;
const {
actions,
log,
more_actions_needed_by_instruction,
error,
usage,
rawResponse,
sleep,
} = planResult;
let stopCollecting = false;
let bboxCollected = false;
@ -593,7 +600,7 @@ export class PageTaskExecutor {
if (finalActions.length === 0) {
assert(
finish,
!more_actions_needed_by_instruction,
error
? `Failed to plan: ${error}`
: planParsingError || 'No plan found',
@ -613,7 +620,7 @@ export class PageTaskExecutor {
return {
output: {
actions: finalActions,
finish,
more_actions_needed_by_instruction,
log,
},
cache: {
@ -699,7 +706,7 @@ export class PageTaskExecutor {
actions,
thought: actions[0]?.thought,
actionType: actions[0].type,
finish: false,
more_actions_needed_by_instruction: true,
log: '',
},
cache: {
@ -772,7 +779,7 @@ export class PageTaskExecutor {
}
// console.log('planningResult is', planResult);
if (planResult.finish) {
if (!planResult.more_actions_needed_by_instruction) {
planningTask = null;
break;
}