// Source: midscene/packages/evaluation/tests/llm-planning.test.ts (TypeScript, 182 lines, 4.9 KiB)

import { writeFileSync } from 'node:fs';
import { basename } from 'node:path';
import {
  MIDSCENE_MODEL_NAME,
  type PlanningAIResponse,
  type Rect,
  getAIConfig,
  plan,
} from '@midscene/core';
import { adaptBboxToRect } from '@midscene/core/ai-model';
import { sleep } from '@midscene/core/utils';
import { vlLocateMode } from '@midscene/shared/env';
import { saveBase64Image } from '@midscene/shared/img';
import dotenv from 'dotenv';
import { afterEach, describe, expect, test } from 'vitest';
import { TestResultCollector } from '../src/test-analyzer';
import { annotateRects, buildContext, getCases } from './util';
// Load .env first so every environment read below sees its values.
// NOTE(review): `override: true` presumably lets .env entries replace variables
// that are already set in the process environment — confirm against dotenv docs.
dotenv.config({ override: true, debug: true });

// Fixture sources exercised by the element-based planning suite.
const testSources = ['todo'];

// Optional guard: when MIDSCENE_EVALUATION_EXPECT_VL is set, the file fails at
// load time unless vlLocateMode() reports a truthy value.
if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
  expect(vlLocateMode()).toBeTruthy();
}

// Threshold handed to TestResultCollector.analyze(): 2 when CI is set, else 0.
const failCaseThreshold = !process.env.CI ? 0 : 2;

// Decides which of the two planning suites runs (see the describe.skipIf calls).
const vlMode = vlLocateMode();
/**
 * Element-based planning evaluation.
 *
 * The suite is skipped when `vlMode` is truthy. For every entry in
 * `testSources` it loads the 'planning' cases, runs `plan()` once per case
 * against a freshly built context, and records each response together with
 * its wall-clock duration in a per-source `TestResultCollector`.
 *
 * When `UPDATE_ANSWER_DATA` is set, the response is stored on the case and the
 * whole case file is rewritten after every case.
 */
describe.skipIf(vlMode)('ai planning - by element', () => {
  for (const source of testSources) {
    test(
      `${source}: planning`,
      async () => {
        const { path: aiDataPath, content: cases } = await getCases(
          source,
          'planning',
        );

        // The data file's base name labels this group in the collected results.
        const caseGroupName = basename(aiDataPath);
        const resultCollector = new TestResultCollector(
          `${caseGroupName}-planning`,
          getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
        );

        for (const testCase of cases.testCases) {
          const context = await buildContext(source);
          const prompt = testCase.prompt;
          const startTime = Date.now();

          // A rejection from plan() is not caught here, so it fails the test.
          const res = await plan(prompt, {
            context,
            pageType: 'puppeteer',
          });

          if (process.env.UPDATE_ANSWER_DATA) {
            // Persist the fresh response as the new reference answer.
            testCase.response_planning = res;
            writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));
          }

          resultCollector.addResult(
            caseGroupName,
            testCase,
            res,
            Date.now() - startTime,
          );
        }

        await resultCollector.printSummary();
        await resultCollector.analyze(caseGroupName, failCaseThreshold);
        // NOTE(review): presumably a 3 s cool-down before the next source — confirm intent.
        await sleep(3 * 1000);
      },
      // Third argument to test(): per-test timeout, 240 s.
      240 * 1000,
    );
  }
});
// Case groups evaluated by the coordinate-based planning suite.
const vlCases = ['todo-vl', 'aweme-login-vl', 'antd-form-vl', 'antd-tooltip-vl'];

// Module-level collector that the coordinate-based suite adds its results to.
const resultCollector = new TestResultCollector(
  'planning',
  getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
);

// Registered at file scope: prints the shared collector's summary after each test.
afterEach(async function printSharedSummary() {
  await resultCollector.printSummary();
});
/**
 * Coordinate-based (VL) planning evaluation.
 *
 * The suite is skipped unless `vlMode` is truthy. For every entry in
 * `vlCases` it loads the 'planning' cases and runs `plan()` once per case.
 * A failure thrown by `plan()` is captured and recorded as the case's result
 * instead of aborting the remaining cases.
 *
 * When `UPDATE_ANSWER_DATA` is set, the response (or its error message) is
 * stored on the case, a located bbox is converted to a rect and collected as
 * an annotation, and the case file is rewritten after every case. Whenever
 * annotations exist, an annotated screenshot is saved next to the data file.
 */
describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
  for (const source of vlCases) {
    test(
      `${source}: planning`,
      async () => {
        const { path: aiDataPath, content: cases } = await getCases(
          source,
          'planning',
        );

        // The data file's base name labels this group in the collected results.
        const caseGroupName = basename(aiDataPath);

        // Rects gathered while refreshing answer data; drawn onto the screenshot below.
        const annotations: Array<{
          indexId: number;
          rect: Rect;
        }> = [];

        for (const [index, testCase] of cases.testCases.entries()) {
          // The context is built from the source name with its '-vl' marker removed.
          const context = await buildContext(source.replace('-vl', ''));
          const prompt = testCase.prompt;
          const startTime = Date.now();

          let res: PlanningAIResponse | Error;
          try {
            res = await plan(prompt, {
              log: testCase.log,
              context,
              actionContext: testCase.action_context,
              pageType: 'puppeteer',
            });
          } catch (error) {
            // Normalise non-Error throwables so the instanceof check below
            // never mistakes a thrown value for a planning response.
            res = error instanceof Error ? error : new Error(String(error));
          }

          if (process.env.UPDATE_ANSWER_DATA) {
            if (res instanceof Error) {
              // Store only the message. NOTE(review): the cast suggests this
              // shape does not match the declared response type — confirm.
              testCase.response_planning = {
                error: res.message,
              } as any;
            } else {
              testCase.response_planning = res;
              if (res.action?.locate?.bbox) {
                // Annotation ids are 1-based.
                const indexId = index + 1;
                testCase.response_rect = adaptBboxToRect(
                  res.action.locate.bbox,
                  context.size.width,
                  context.size.height,
                );
                testCase.annotation_index_id = indexId;
                annotations.push({
                  indexId,
                  rect: testCase.response_rect,
                });
              }
            }
            writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));
          }

          // Re-rendered on every iteration once at least one rect exists, so
          // the saved image always contains all rects collected so far.
          if (annotations.length > 0) {
            const markedImage = await annotateRects(
              context.screenshotBase64,
              annotations.map((item) => item.rect),
            );
            await saveBase64Image({
              base64Data: markedImage,
              outputPath: `${aiDataPath}-planning-coordinates-annotated.png`,
            });
          }

          resultCollector.addResult(
            caseGroupName,
            testCase,
            res,
            Date.now() - startTime,
          );
        }

        await resultCollector.analyze(caseGroupName, failCaseThreshold);
        // NOTE(review): presumably a 3 s cool-down before the next case group — confirm intent.
        await sleep(3 * 1000);
      },
      // Third argument to test(): per-test timeout, 240 s.
      240 * 1000,
    );
  }
});