// midscene/packages/evaluation/tests/llm-planning.test.ts

import { writeFileSync } from 'node:fs';
import {
  MIDSCENE_MODEL_NAME,
  type PlanningAIResponse,
  getAIConfig,
  plan,
} from '@midscene/core';
import { MIDSCENE_USE_QWEN_VL, getAIConfigInBoolean } from '@midscene/core/env';
import { sleep } from '@midscene/core/utils';
import { saveBase64Image } from '@midscene/shared/img';
import dotenv from 'dotenv';
import { describe, expect, test } from 'vitest';
import { TestResultCollector } from '../src/test-analyzer';
import { annotatePoints, buildContext, getCases } from './util';
// Load variables from the .env file before anything below reads the
// environment. `override: true` lets .env values replace variables that are
// already set; `debug: true` turns on dotenv's diagnostic logging.
dotenv.config({ override: true, debug: true });

// Threshold passed to `TestResultCollector.analyze()`: 1 when the `CI`
// environment variable is set to a non-empty value, 0 otherwise.
const failCaseThreshold = process.env.CI ? 1 : 0;

// Case sets exercised by the element-based planning suite.
// Currently disabled: 'online_order', 'online_order_list', 'taobao',
// 'aweme-login', 'aweme-play'.
const testSources: string[] = ['todo'];

// Boolean reading of the MIDSCENE_USE_QWEN_VL config flag. It selects which of
// the two planning suites in this file runs: the element-based suite is
// skipped when it is truthy, the coordinate-based suite when it is falsy.
const vlMode = getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL);

/**
 * Element-based planning evaluation; the whole suite is skipped when `vlMode`
 * is truthy.
 *
 * One test is registered per entry of `testSources`. Each test loads the
 * "planning" cases for that source, calls `plan()` once per case, and hands
 * every response (with its elapsed time) to a `TestResultCollector`. When the
 * `UPDATE_ANSWER_DATA` environment variable is set, each response is also
 * written back into the case file.
 */
describe.skipIf(vlMode)('ai planning - by element', () => {
  // Per-test timeout, in milliseconds.
  const timeoutMs = 240 * 1000;

  for (const source of testSources) {
    const runPlanningCases = async () => {
      const loaded = await getCases(source, 'planning');
      const aiDataPath = loaded.path;
      const cases = loaded.content;

      const modelName = getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified';
      const resultCollector = new TestResultCollector(
        `${source}-planning`,
        modelName,
      );

      for (const [, testCase] of cases.testCases.entries()) {
        // A fresh context is built for every case.
        const context = await buildContext(source);

        const startedAt = Date.now();
        const planned = await plan(testCase.prompt, { context });

        if (process.env.UPDATE_ANSWER_DATA) {
          // Persist the new answer right away so progress survives a later failure.
          testCase.response_planning = planned;
          writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));
        }

        // The elapsed time deliberately includes the optional write above,
        // because it is measured at the point of this call.
        resultCollector.addResult(
          aiDataPath.split('/').pop() || '',
          testCase,
          planned,
          Date.now() - startedAt,
        );
      }

      // NOTE(review): presumably fails the test when more cases than
      // `failCaseThreshold` fail — confirm in TestResultCollector.
      await resultCollector.analyze(failCaseThreshold);
      // Pause for three seconds before the test finishes (reason not visible here).
      await sleep(3 * 1000);
    };

    test(`${source}: planning`, runPlanningCases, timeoutMs);
  }
});

// Case sets exercised by the coordinate-based (VL) planning suite.
// To debug a single case set, temporarily narrow the list,
// e.g. to ['aweme-login-vl'].
const vlCases: string[] = [
  'todo-vl',
  'aweme-login-vl',
  'antd-form-vl',
  'antd-tooltip-vl',
];

/**
 * Coordinate-based planning evaluation; the whole suite is skipped unless
 * `vlMode` is truthy.
 *
 * One test is registered per entry of `vlCases`. Each test loads the
 * "planning" cases for that source, calls `plan()` once per case (passing the
 * case's `log`), and hands every outcome to a `TestResultCollector`. A failure
 * thrown by `plan()` is captured and recorded as an `Error` result instead of
 * aborting the remaining cases.
 *
 * When the `UPDATE_ANSWER_DATA` environment variable is set, the outcome is
 * written back into the case file, and any returned bounding box is collected
 * and drawn onto the screenshot that is saved next to the case file.
 */
describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
  vlCases.forEach((source) => {
    test(
      `${source}: planning`,
      async () => {
        const { path: aiDataPath, content: cases } = await getCases(
          source,
          'planning',
        );

        const resultCollector = new TestResultCollector(
          `${source}-planning`,
          getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
        );

        // Bounding boxes gathered so far in this test; only filled while
        // UPDATE_ANSWER_DATA is set.
        const annotations: Array<{
          indexId: number;
          points: [number, number, number, number];
        }> = [];

        // The page context is looked up under the source name without its
        // "-vl" suffix. The pattern is anchored so that only the trailing
        // suffix is removed, never an earlier "-vl" inside the name.
        const contextSource = source.replace(/-vl$/, '');

        for (const [index, testCase] of cases.testCases.entries()) {
          const context = await buildContext(contextSource);

          const prompt = testCase.prompt;
          const startTime = Date.now();

          let res: PlanningAIResponse | Error;
          try {
            res = await plan(prompt, {
              log: testCase.log,
              context,
            });
          } catch (error) {
            // Anything can be thrown; wrap non-Error values so that the
            // `instanceof Error` check below is reliable and a thrown string or
            // object is never stored as if it were a planning response.
            res = error instanceof Error ? error : new Error(String(error));
          }

          if (process.env.UPDATE_ANSWER_DATA) {
            if (res instanceof Error) {
              // Only the message is persisted. The cast is needed because this
              // shape is not a planning response.
              testCase.response_planning = {
                error: res.message,
              } as any;
            } else {
              testCase.response_planning = res;
              if (res.action?.locate?.bbox) {
                // Annotation ids are 1-based positions of the case in the file.
                const indexId = index + 1;
                testCase.response_bbox = res.action.locate.bbox;
                testCase.annotation_index_id = indexId;
                annotations.push({
                  indexId,
                  points: res.action.locate.bbox,
                });
              }
            }
            writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));
          }

          if (annotations.length > 0) {
            // Redraw all boxes collected so far on the current screenshot and
            // overwrite the annotated image.
            const markedImage = await annotatePoints(
              context.screenshotBase64,
              annotations,
            );
            await saveBase64Image({
              base64Data: markedImage,
              outputPath: `${aiDataPath}-planning-coordinates-annotated.png`,
            });
          }

          // The elapsed time includes the optional file write and image
          // annotation above, because it is measured at the point of this call.
          resultCollector.addResult(
            aiDataPath.split('/').pop() || '',
            testCase,
            res,
            Date.now() - startTime,
          );
        }

        // NOTE(review): presumably fails the test when more cases than
        // `failCaseThreshold` fail — confirm in TestResultCollector.
        await resultCollector.analyze(failCaseThreshold);
        // Pause for three seconds before the test finishes (reason not visible here).
        await sleep(3 * 1000);
      },
      240 * 1000,
    );
  });
});
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`import { writeFileSync } from 'node:fs';`
			`import {`
			`MIDSCENE_MODEL_NAME,`
chore: tuning the prompt of qwen (#433) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-03-03 11:39:28 +08:00			`type PlanningAIResponse,`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`getAIConfig,`
			`plan,`
			`} from '@midscene/core';`
fix: planning prompt (#448) * feat: add more case for llm planning * fix: ai e2e * chore: use debug to print log * chore: fix error in gpt mode 2025-03-10 16:50:43 +08:00			`import { MIDSCENE_USE_QWEN_VL, getAIConfigInBoolean } from '@midscene/core/env';`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`import { sleep } from '@midscene/core/utils';`
fix: planning prompt (#448) * feat: add more case for llm planning * fix: ai e2e * chore: use debug to print log * chore: fix error in gpt mode 2025-03-10 16:50:43 +08:00			`import { saveBase64Image } from '@midscene/shared/img';`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`import dotenv from 'dotenv';`
			`import { describe, expect, test } from 'vitest';`
refactor: switch bundle type to bundleless (#437) 2025-03-07 17:20:18 +08:00			`import { TestResultCollector } from '../src/test-analyzer';`
fix: planning prompt (#448) * feat: add more case for llm planning * fix: ai e2e * chore: use debug to print log * chore: fix error in gpt mode 2025-03-10 16:50:43 +08:00			`import { annotatePoints, buildContext, getCases } from './util';`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`dotenv.config({`
			`debug: true,`
			`override: true,`
			`});`

			`const failCaseThreshold = process.env.CI ? 1 : 0;`
			`const testSources = [`
			`'todo',`
			`// 'online_order',`
			`// 'online_order_list',`
			`// 'taobao',`
			`// 'aweme-login',`
			`// 'aweme-play',`
			`];`

			`const vlMode = getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL);`

			`describe.skipIf(vlMode)('ai planning - by element', () => {`
			`testSources.forEach((source) => {`
			`test(`
			`${source}: planning`,
			`async () => {`
			`const { path: aiDataPath, content: cases } = await getCases(`
			`source,`
			`'planning',`
			`);`

			`const resultCollector = new TestResultCollector(`
			`${source}-planning`,
			`getAIConfig(MIDSCENE_MODEL_NAME) \|\| 'unspecified',`
			`);`

			`for (const [, testCase] of cases.testCases.entries()) {`
			`const context = await buildContext(source);`

			`const prompt = testCase.prompt;`
			`const startTime = Date.now();`

			`const res = await plan(prompt, {`
			`context,`
			`});`

			`if (process.env.UPDATE_ANSWER_DATA) {`
			`testCase.response_planning = res;`
			`writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));`
			`}`

			`resultCollector.addResult(`
			`aiDataPath.split('/').pop() \|\| '',`
			`testCase,`
			`res,`
			`Date.now() - startTime,`
			`);`
			`}`

			`await resultCollector.analyze(failCaseThreshold);`
			`await sleep(3 * 1000);`
			`},`
fix: over planning for Qwen in page with form (#429) 2025-02-27 16:44:01 +08:00			`240 * 1000,`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`);`
			`});`
			`});`

fix: planning prompt (#448) * feat: add more case for llm planning * fix: ai e2e * chore: use debug to print log * chore: fix error in gpt mode 2025-03-10 16:50:43 +08:00			`const vlCases = [`
			`'todo-vl',`
			`'aweme-login-vl',`
			`'antd-form-vl',`
			`'antd-tooltip-vl',`
			`];`
fix: race condition in bridge mode (#440) 2025-03-07 17:38:46 +08:00			`// const vlCases = ['aweme-login-vl'];`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00
			`describe.skipIf(!vlMode)('ai planning - by coordinates', () => {`
			`vlCases.forEach((source) => {`
			`test(`
			`${source}: planning`,
			`async () => {`
			`const { path: aiDataPath, content: cases } = await getCases(`
			`source,`
			`'planning',`
			`);`

			`const resultCollector = new TestResultCollector(`
			`${source}-planning`,
			`getAIConfig(MIDSCENE_MODEL_NAME) \|\| 'unspecified',`
			`);`

fix: planning prompt (#448) * feat: add more case for llm planning * fix: ai e2e * chore: use debug to print log * chore: fix error in gpt mode 2025-03-10 16:50:43 +08:00			`const annotations: Array<{`
			`indexId: number;`
			`points: [number, number, number, number];`
			`}> = [];`

			`for (const [index, testCase] of cases.testCases.entries()) {`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`const context = await buildContext(source.replace('-vl', ''));`

			`const prompt = testCase.prompt;`
			`const startTime = Date.now();`

chore: tuning the prompt of qwen (#433) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-03-03 11:39:28 +08:00			`let res: PlanningAIResponse \| Error;`
			`try {`
			`res = await plan(prompt, {`
			`log: testCase.log,`
			`context,`
			`});`
			`} catch (error) {`
			`res = error as Error;`
			`}`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00
			`if (process.env.UPDATE_ANSWER_DATA) {`
chore: tuning the prompt of qwen (#433) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-03-03 11:39:28 +08:00			`if (res instanceof Error) {`
			`testCase.response_planning = {`
			`error: res.message,`
			`} as any;`
			`} else {`
			`testCase.response_planning = res;`
fix: planning prompt (#448) * feat: add more case for llm planning * fix: ai e2e * chore: use debug to print log * chore: fix error in gpt mode 2025-03-10 16:50:43 +08:00			`if (res.action?.locate?.bbox) {`
			`const indexId = index + 1;`
			`testCase.response_bbox = res.action.locate.bbox;`
			`testCase.annotation_index_id = indexId;`
			`annotations.push({`
			`indexId,`
			`points: res.action.locate.bbox,`
			`});`
			`}`
chore: tuning the prompt of qwen (#433) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-03-03 11:39:28 +08:00			`}`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));`
			`}`

fix: planning prompt (#448) * feat: add more case for llm planning * fix: ai e2e * chore: use debug to print log * chore: fix error in gpt mode 2025-03-10 16:50:43 +08:00			`if (annotations.length > 0) {`
			`const markedImage = await annotatePoints(`
			`context.screenshotBase64,`
			`annotations,`
			`);`
			`await saveBase64Image({`
			`base64Data: markedImage,`
			outputPath: `${aiDataPath}-planning-coordinates-annotated.png`,
			`});`
			`}`

feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`resultCollector.addResult(`
			`aiDataPath.split('/').pop() \|\| '',`
			`testCase,`
			`res,`
			`Date.now() - startTime,`
			`);`
			`}`

			`await resultCollector.analyze(failCaseThreshold);`
			`await sleep(3 * 1000);`
			`},`
fix: over planning for Qwen in page with form (#429) 2025-02-27 16:44:01 +08:00			`240 * 1000,`
feat: locate by coord (#383) --------- Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com> 2025-02-21 09:56:09 +08:00			`);`
			`});`
			`});`