From ee521e31e0fabe5c5a7dd338fc35ca35178b770c Mon Sep 17 00:00:00 2001 From: yuyutaotao <167746126+yuyutaotao@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:44:30 +0800 Subject: [PATCH] fix: fix the Player style when the locator is failed (#129) * fix: fix the Player style when the locator is failed * fix: add marker image into UIContext --- packages/midscene/src/ai-model/inspect.ts | 4 +- .../midscene/src/ai-model/prompt/planning.ts | 19 +++++-- packages/midscene/src/types.ts | 2 + packages/midscene/src/utils.ts | 2 +- packages/visualizer/scripts/build-html.ts | 2 +- .../visualizer/src/component/blackboard.tsx | 54 ++++++++++++------- packages/visualizer/src/component/player.less | 5 +- packages/visualizer/src/component/player.tsx | 8 ++- .../src/component/replay-scripts.tsx | 19 ++++++- packages/visualizer/src/component/store.tsx | 16 +++--- packages/web-integration/src/common/utils.ts | 5 +- .../src/extractor/client-extractor.ts | 1 + .../tests/ai/web/puppeteer/showcase.test.ts | 4 +- 13 files changed, 98 insertions(+), 43 deletions(-) diff --git a/packages/midscene/src/ai-model/inspect.ts b/packages/midscene/src/ai-model/inspect.ts index 16de541f5..39909ce9b 100644 --- a/packages/midscene/src/ai-model/inspect.ts +++ b/packages/midscene/src/ai-model/inspect.ts @@ -39,7 +39,7 @@ export async function AiInspectElement< }) { const { context, multi, targetElementDescription, callAI, useModel } = options; - const { screenshotBase64 } = context; + const { screenshotBase64, screenshotBase64WithElementMarker } = context; const { description, elementById } = await describeUserPage(context); // meet quick answer @@ -61,7 +61,7 @@ export async function AiInspectElement< { type: 'image_url', image_url: { - url: screenshotBase64, + url: screenshotBase64WithElementMarker || screenshotBase64, }, }, { diff --git a/packages/midscene/src/ai-model/prompt/planning.ts b/packages/midscene/src/ai-model/prompt/planning.ts index e9a8d11a8..8525df947 100644 --- a/packages/midscene/src/ai-model/prompt/planning.ts +++ b/packages/midscene/src/ai-model/prompt/planning.ts @@ -42,16 +42,16 @@ Remember: If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query. -## Objective 2 (sub objective): Give a quick answer to the action with type "Locate" you just planned +## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field -Review the action you just planned. If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON: +If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON: { "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button", "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty "id": "wefew2222few2" // id of this element, replace with actual value in practice } -If the action type is not 'Locate', or there is no element meets the description in the prompt (usually because it will show up after some interaction), the answer should be null. +If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null. ## Output JSON Format: @@ -65,7 +65,7 @@ Please return the result in JSON format as follows: "param": { "prompt": "The search bar" }, - "quickAnswer": { // since the first action is Locate, so we need to give a quick answer + "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar", "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty "id": "wefew2222few2" // ID of this element, replace with actual value in practice @@ -76,6 +76,14 @@ Please return the result in JSON format as follows: "type": "Tap", // Type of action, like 'Tap' 'Hover' ... "param": any, // Parameter towards the task type }, + { + "thought": "Reasons for generating this task, and why this task is feasible on this page", + "type": "Locate", // Type of action, like 'Tap' 'Hover' ... + "param": { + "prompt": "The search bar" + }, + "quickAnswer": null, + }, // ... more actions ], error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here, @@ -111,7 +119,8 @@ export const planSchema: ResponseFormatJSONSchema = { }, param: { type: ['object', 'null'], - description: 'Parameter towards the task type, can be null', + description: + 'Parameter towards the task type, can be null only when the type field is Tap or Hover', }, quickAnswer: { type: ['object', 'null'], diff --git a/packages/midscene/src/types.ts b/packages/midscene/src/types.ts index d5e12e345..fef2c9c7a 100644 --- a/packages/midscene/src/types.ts +++ b/packages/midscene/src/types.ts @@ -81,6 +81,8 @@ export interface AIAssertionResponse { export abstract class UIContext { abstract screenshotBase64: string; + abstract screenshotBase64WithElementMarker?: string; + abstract content: ElementType[]; abstract size: Size; diff --git a/packages/midscene/src/utils.ts b/packages/midscene/src/utils.ts index fd9c5f36e..58f8de56b 100644 --- a/packages/midscene/src/utils.ts +++ b/packages/midscene/src/utils.ts @@ -57,7 +57,7 @@ export function writeDumpReport( const attributesArr = Object.keys(attributes || {}).map((key) => { return `${key}="${encodeURIComponent(attributes![key])}"`; }); - return ``; + return ``; }); reportContent = tpl.replace('{{dump}}', dumps.join('\n')); } diff --git a/packages/visualizer/scripts/build-html.ts b/packages/visualizer/scripts/build-html.ts index e75399242..f51ff6f0e 100644 --- a/packages/visualizer/scripts/build-html.ts +++ b/packages/visualizer/scripts/build-html.ts @@ -72,7 +72,7 @@ function build() { const resultWithDemo = tplReplacer(html, { css: `\n`, js: ``, - dump: ``, + dump: ``, }); writeFileSync(outputDemoHTML, resultWithDemo); console.log(`HTML file generated successfully: ${outputDemoHTML}`); diff --git a/packages/visualizer/src/component/blackboard.tsx b/packages/visualizer/src/component/blackboard.tsx index 192cb466c..70aebec62 100644 --- a/packages/visualizer/src/component/blackboard.tsx +++ b/packages/visualizer/src/component/blackboard.tsx @@ -11,8 +11,6 @@ import { useBlackboardPreference, useInsightDump } from './store'; const itemFillAlpha = 0.4; const highlightAlpha = 0.4; -const bgOnAlpha = 1; -const bgOffAlpha = 0.3; const noop = () => { // noop }; @@ -70,7 +68,7 @@ const BlackBoard = (): JSX.Element => { const highlightIds = highlightElements.map((e) => e.id); const { context } = dump!; - const { size, screenshotBase64 } = context; + const { size, screenshotBase64, screenshotBase64WithElementMarker } = context; const screenWidth = size.width; const screenHeight = size.height; @@ -84,9 +82,11 @@ const BlackBoard = (): JSX.Element => { // key overlays const pixiBgRef = useRef(); - const { bgVisible, setBgVisible, elementsVisible, setTextsVisible } = + const { markerVisible, setMarkerVisible, elementsVisible, setTextsVisible } = useBlackboardPreference(); + const ifMarkerAvailable = !!screenshotBase64WithElementMarker; + useEffect(() => { Promise.resolve( (async () => { @@ -139,14 +139,28 @@ const BlackBoard = (): JSX.Element => { img.onload = () => { if (!app.stage) return; const screenshotTexture = PIXI.Texture.from(img); - const screenshotSprite = new PIXI.Sprite(screenshotTexture); - screenshotSprite.x = 0; - screenshotSprite.y = 0; - screenshotSprite.width = screenWidth; - screenshotSprite.height = screenHeight; - app.stage.addChildAt(screenshotSprite, 0); - pixiBgRef.current = screenshotSprite; - screenshotSprite.alpha = bgVisible ? bgOnAlpha : bgOffAlpha; + const backgroundSprite = new PIXI.Sprite(screenshotTexture); + backgroundSprite.x = 0; + backgroundSprite.y = 0; + backgroundSprite.width = screenWidth; + backgroundSprite.height = screenHeight; + app.stage.addChildAt(backgroundSprite, 0); + + if (ifMarkerAvailable) { + const markerImg = new Image(); + markerImg.src = screenshotBase64WithElementMarker; + markerImg.onload = () => { + const markerTexture = PIXI.Texture.from(markerImg); + const markerSprite = new PIXI.Sprite(markerTexture); + markerSprite.x = 0; + markerSprite.y = 0; + markerSprite.width = screenWidth; + markerSprite.height = screenHeight; + app.stage.addChildAt(markerSprite, 1); + pixiBgRef.current = markerSprite; + markerSprite.visible = markerVisible; + }; + } }; }, [app.stage, appInitialed]); @@ -156,7 +170,7 @@ const BlackBoard = (): JSX.Element => { highlightContainer.removeChildren(); elementMarkContainer.removeChildren(); - // element mark + // element rects context.content.forEach((element) => { const { rect, content, id } = element; const ifHighlight = highlightIds.includes(id); @@ -198,10 +212,10 @@ const BlackBoard = (): JSX.Element => { // elementsVisible, ]); - const onSetBg: CheckboxProps['onChange'] = (e) => { - setBgVisible(e.target.checked); + const onSetMarkerVisible: CheckboxProps['onChange'] = (e) => { + setMarkerVisible(e.target.checked); if (pixiBgRef.current) { - pixiBgRef.current.alpha = e.target.checked ? bgOnAlpha : bgOffAlpha; + pixiBgRef.current.visible = e.target.checked; } }; @@ -238,8 +252,12 @@ const BlackBoard = (): JSX.Element => { />
- - Screenshot + + Marker Elements diff --git a/packages/visualizer/src/component/player.less b/packages/visualizer/src/component/player.less index a7b731b53..b093a12f6 100644 --- a/packages/visualizer/src/component/player.less +++ b/packages/visualizer/src/component/player.less @@ -9,7 +9,7 @@ width: fit-content; max-width: 100%; max-height: 100%; - padding: @player-spacing; + padding: @player-spacing 0; padding-bottom: 0; background: #434443DD; box-sizing: border-box; @@ -27,7 +27,7 @@ align-items: center; justify-content: center; overflow: hidden; - + padding: 0 @player-spacing; canvas { max-width: 100%; max-height: 100%; @@ -65,6 +65,7 @@ display: flex; flex-direction: row; flex-shrink: 0; + padding: 0 @player-spacing; .status-icon { transition: .2s; diff --git a/packages/visualizer/src/component/player.tsx b/packages/visualizer/src/component/player.tsx index 049447d33..326199a51 100644 --- a/packages/visualizer/src/component/player.tsx +++ b/packages/visualizer/src/component/player.tsx @@ -545,7 +545,8 @@ const Player = (): JSX.Element => { return acc + item.duration + (item.insightCameraDuration || 0); }, 0); - const progressUpdateInterval = 300; + // progress bar + const progressUpdateInterval = 200; const startTime = performance.now(); setAnimationProgress(0); const updateProgress = () => { @@ -553,8 +554,11 @@ const Player = (): JSX.Element => { (performance.now() - startTime) / totalDuration, 1, ); + setAnimationProgress(progress); - return timeout(updateProgress, progressUpdateInterval); + if (progress < 1) { + return timeout(updateProgress, progressUpdateInterval); + } }; frame(updateProgress); diff --git a/packages/visualizer/src/component/replay-scripts.tsx b/packages/visualizer/src/component/replay-scripts.tsx index 44ac10ddf..e46f8be47 100644 --- a/packages/visualizer/src/component/replay-scripts.tsx +++ b/packages/visualizer/src/component/replay-scripts.tsx @@ -40,6 +40,7 @@ export interface AnimationScript { } const stillDuration = 1200; +const stillAfterInsightDuration = 300; const locateDuration = 800; const actionDuration = 1000; const clearInsightDuration = 200; @@ -185,9 +186,23 @@ export const generateAnimationScripts = ( throw new Error('insight dump is required'); } const insightContentLength = insightDump.context.content.length; + + if (insightDump.context.screenshotBase64WithElementMarker) { + // show the original screenshot first + scripts.push({ + type: 'img', + img: insightDump.context.screenshotBase64, + duration: stillAfterInsightDuration, + title, + subTitle, + }); + } + scripts.push({ type: 'insight', - img: insightDump.context.screenshotBase64, + img: + insightDump.context.screenshotBase64WithElementMarker || + insightDump.context.screenshotBase64, insightDump: insightDump, camera: currentCameraState === fullPageCameraState || !insightCameraState @@ -202,7 +217,7 @@ export const generateAnimationScripts = ( scripts.push({ type: 'sleep', - duration: 800, + duration: stillAfterInsightDuration, title, subTitle, }); diff --git a/packages/visualizer/src/component/store.tsx b/packages/visualizer/src/component/store.tsx index 469de8452..965889e6f 100644 --- a/packages/visualizer/src/component/store.tsx +++ b/packages/visualizer/src/component/store.tsx @@ -13,15 +13,15 @@ import { generateAnimationScripts } from './replay-scripts'; const { create } = Z; export const useBlackboardPreference = create<{ - bgVisible: boolean; + markerVisible: boolean; elementsVisible: boolean; - setBgVisible: (visible: boolean) => void; + setMarkerVisible: (visible: boolean) => void; setTextsVisible: (visible: boolean) => void; }>((set) => ({ - bgVisible: true, + markerVisible: true, elementsVisible: true, - setBgVisible: (visible: boolean) => { - set({ bgVisible: visible }); + setMarkerVisible: (visible: boolean) => { + set({ markerVisible: visible }); }, setTextsVisible: (visible: boolean) => { set({ elementsVisible: visible }); @@ -126,8 +126,10 @@ export const useExecutionDump = create<{ execution.tasks.forEach((task) => { if (task.type === 'Insight') { const insightTask = task as ExecutionTaskInsightLocate; - width = insightTask.log?.dump?.context?.size?.width || 1920; - height = insightTask.log?.dump?.context?.size?.height || 1080; + if (insightTask.log?.dump?.context?.size?.width) { + width = insightTask.log?.dump?.context?.size?.width; + height = insightTask.log?.dump?.context?.size?.height; + } } }); }); diff --git a/packages/web-integration/src/common/utils.ts b/packages/web-integration/src/common/utils.ts index dd1b61faa..7ab6c7f8d 100644 --- a/packages/web-integration/src/common/utils.ts +++ b/packages/web-integration/src/common/utils.ts @@ -44,7 +44,7 @@ export async function parseContextFromWebPage( const size = await imageInfoOfBase64(screenshotBase64); // composite element infos to screenshot - const screenshotBase64WithElementInfos = await compositeElementInfoImg({ + const screenshotBase64WithElementMarker = await compositeElementInfoImg({ inputImgBase64: screenshotBase64.split(';base64,').pop() as string, elementsPositionInfo: elementsPositionInfoWithoutText, }); @@ -52,7 +52,8 @@ export async function parseContextFromWebPage( return { content: elementsInfo, size, - screenshotBase64: `data:image/png;base64,${screenshotBase64WithElementInfos}`, + screenshotBase64, + screenshotBase64WithElementMarker: `data:image/png;base64,${screenshotBase64WithElementMarker}`, url, }; } diff --git a/packages/web-integration/src/extractor/client-extractor.ts b/packages/web-integration/src/extractor/client-extractor.ts index 35cf0142f..b5e5af29b 100644 --- a/packages/web-integration/src/extractor/client-extractor.ts +++ b/packages/web-integration/src/extractor/client-extractor.ts @@ -157,6 +157,7 @@ export function extractTextWithPosition(initNode: Document): ElementInfo[] { nodeType = NodeType.BUTTON; break; case 'SEARCHINPUT': + case 'TEXTINPUT': case 'INPUT': nodeType = NodeType.FORM_ITEM; break; diff --git a/packages/web-integration/tests/ai/web/puppeteer/showcase.test.ts b/packages/web-integration/tests/ai/web/puppeteer/showcase.test.ts index 0128f66dc..25b719fb9 100644 --- a/packages/web-integration/tests/ai/web/puppeteer/showcase.test.ts +++ b/packages/web-integration/tests/ai/web/puppeteer/showcase.test.ts @@ -58,9 +58,11 @@ describe( ); const mid = new PuppeteerAgent(originPage); + await mid.aiAction('Click the password input on page'); + await mid.aiAction('scroll down two screen'); - const widgets = await mid.aiQuery( + await mid.aiQuery( 'find all inputs in the page, return the field name in string[]', );