fix(report): reduce context size in report file (#626)

* fix(core): reduce context size in report file

* chore(core): fix lint

* chore(core): resolve conflict

---------

Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com>
This commit is contained in:
yuyutaotao 2025-04-24 18:28:45 +08:00 committed by GitHub
parent ce7929bbbc
commit ecefd8b0fa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 108 additions and 114 deletions

View File

@ -5,7 +5,7 @@ import { pluginLess } from '@rsbuild/plugin-less';
import { pluginNodePolyfill } from '@rsbuild/plugin-node-polyfill';
import { pluginReact } from '@rsbuild/plugin-react';
const testDataPath = path.join(__dirname, 'test-data', 'online-order.json');
const testDataPath = path.join(__dirname, 'test-data', 'swag-lab.json');
const testData = JSON.parse(fs.readFileSync(testDataPath, 'utf-8'));
const copyReportTemplate = () => ({

View File

@ -33,12 +33,12 @@ const VIEW_TYPE_JSON = 'json';
const DetailPanel = (): JSX.Element => {
const insightDump = useExecutionDump((store) => store.insightDump);
const dumpId = useExecutionDump((store) => store._insightDumpLoadId);
const blackboardViewAvailable = Boolean(insightDump);
const activeExecution = useExecutionDump((store) => store.activeExecution);
const activeExecutionId = useExecutionDump(
(store) => store._executionDumpLoadId,
);
const activeTask = useExecutionDump((store) => store.activeTask);
const blackboardViewAvailable = Boolean(activeTask?.pageContext);
const [preferredViewType, setViewType] = useState(VIEW_TYPE_REPLAY);
const animationScripts = useExecutionDump(
(store) => store.activeExecutionAnimation,
@ -89,7 +89,7 @@ const DetailPanel = (): JSX.Element => {
if (blackboardViewAvailable) {
content = (
<Blackboard
uiContext={insightDump!.context}
uiContext={activeTask.pageContext}
highlightElements={insightDump!.matchedElement}
highlightRect={insightDump!.taskInfo?.searchArea}
key={`${dumpId}`}

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,10 @@
import type { AIUsageInfo, Rect, Size } from '@/types';
import type {
AIUsageInfo,
BaseElement,
ElementTreeNode,
Rect,
Size,
} from '@/types';
import { assert } from '@midscene/shared/utils';
import type {
@ -13,6 +19,9 @@ import {
import { vlLocateMode } from '@/env';
import type { PlanningLocateParam } from '@/types';
import { NodeType } from '@midscene/shared/constants';
import { treeToList } from '@midscene/shared/extractor';
import { compositeElementInfoImg } from '@midscene/shared/img';
import { getDebug } from '@midscene/shared/logger';
export type AIArgs = [
@ -261,3 +270,26 @@ export function expandSearchArea(rect: Rect, screenSize: Size) {
);
return rect;
}
/**
 * Composite element markers onto a screenshot so a non-vision-grounded LLM
 * can reference page elements by their drawn index.
 *
 * Text nodes are excluded from the markup: they are numerous and their
 * boxes would clutter the image without aiding element grounding.
 *
 * @param screenshotBase64 - base64-encoded screenshot to annotate
 * @param tree - element tree of the page; flattened before filtering
 * @param size - page size used when compositing the annotated image
 * @returns base64-encoded image with non-text elements visually marked
 */
export async function markupImageForLLM(
  screenshotBase64: string,
  tree: ElementTreeNode<BaseElement>,
  size: Size,
) {
  // Flatten the tree; guard explicitly instead of using a non-null assertion.
  const elementsInfo = treeToList(tree) ?? [];
  // Keep only non-text nodes for annotation.
  const elementsPositionInfoWithoutText = elementsInfo.filter(
    (elementInfo) => elementInfo.attributes.nodeType !== NodeType.TEXT,
  );
  const imagePayload = await compositeElementInfoImg({
    inputImgBase64: screenshotBase64,
    // NOTE(review): cast carried over from original — tighten the
    // compositeElementInfoImg element type so this `as any` can go away.
    elementsPositionInfo: elementsPositionInfoWithoutText as any,
    size,
  });
  return imagePayload;
}

View File

@ -32,6 +32,7 @@ import {
adaptBboxToRect,
callAiFn,
expandSearchArea,
markupImageForLLM,
mergeRects,
} from './common';
import { systemPromptToAssert } from './prompt/assertion';
@ -128,7 +129,7 @@ export async function AiLocateElement<
usage?: AIUsageInfo;
}> {
const { context, targetElementDescription, callAI } = options;
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
const { screenshotBase64 } = context;
const { description, elementById, insertElementByPosition, size } =
await describeUserPage(context);
// meet quick answer
@ -153,7 +154,7 @@ export async function AiLocateElement<
});
const systemPrompt = systemPromptToLocateElement(vlLocateMode());
let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
let imagePayload = screenshotBase64;
if (options.searchConfig) {
assert(
@ -166,8 +167,14 @@ export async function AiLocateElement<
);
imagePayload = options.searchConfig.imageBase64;
} else if (getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL)) {
} else if (vlLocateMode() === 'qwen-vl') {
imagePayload = await paddingToMatchBlockByBase64(imagePayload);
} else if (!vlLocateMode()) {
imagePayload = await markupImageForLLM(
screenshotBase64,
context.tree,
context.size,
);
}
const msgs: AIArgs = [

View File

@ -7,6 +7,7 @@ import {
type AIArgs,
callAiFn,
fillLocateParam,
markupImageForLLM,
warnGPT4oSizeLimit,
} from './common';
import {
@ -27,7 +28,7 @@ export async function plan(
},
): Promise<PlanningAIResponse> {
const { callAI, context } = opts || {};
const { screenshotBase64, screenshotBase64WithElementMarker, size } = context;
const { screenshotBase64, size } = context;
const { description: pageDescription } = await describeUserPage(context);
const systemPrompt = await systemPromptToTaskPlanning({
@ -46,9 +47,15 @@ export async function plan(
taskBackgroundContext: taskBackgroundContextText,
});
let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
let imagePayload = screenshotBase64;
if (vlLocateMode() === 'qwen-vl') {
imagePayload = await paddingToMatchBlockByBase64(imagePayload);
} else if (!vlLocateMode()) {
imagePayload = await markupImageForLLM(
screenshotBase64,
context.tree,
context.size,
);
}
warnGPT4oSizeLimit(size);

View File

@ -340,7 +340,7 @@ export async function callToGetJSONObject<T>(
const model = getModelName();
if (model.includes('gpt-4o') || model.includes('gpt-4.1')) {
if (model.includes('gpt-4')) {
switch (AIActionTypeValue) {
case AIActionType.ASSERT:
responseFormat = assertSchema;

View File

@ -158,7 +158,6 @@ export default class Insight<
const dumpData: PartialInsightDumpFromSDK = {
type: 'locate',
context,
userQuery: {
element: queryPrompt,
},
@ -256,7 +255,6 @@ export default class Insight<
const dumpData: PartialInsightDumpFromSDK = {
type: 'extract',
context,
userQuery: {
dataDemand,
},
@ -314,7 +312,6 @@ export default class Insight<
const { thought, pass } = assertResult.content;
const dumpData: PartialInsightDumpFromSDK = {
type: 'assert',
context,
userQuery: {
assertion,
},

View File

@ -119,8 +119,6 @@ export interface AIAssertionResponse {
export abstract class UIContext<ElementType extends BaseElement = BaseElement> {
abstract screenshotBase64: string;
abstract screenshotBase64WithElementMarker?: string;
// @deprecated('use tree instead')
abstract content: ElementType[];
@ -193,7 +191,6 @@ export interface ReportDumpWithAttributes {
export interface InsightDump extends DumpMeta {
type: 'locate' | 'extract' | 'assert';
logId: string;
context: UIContext;
userQuery: {
element?: string;
dataDemand?: InsightExtractParam;

View File

@ -146,6 +146,14 @@ export function writeDumpReport(
return null;
}
writeFileSync(reportPath, reportContent);
if (process.env.MIDSCENE_DEBUG_LOG_JSON) {
writeFileSync(
`${reportPath}.json`,
typeof dumpData === 'string'
? dumpData
: JSON.stringify(dumpData, null, 2),
);
}
return reportPath;
}

View File

@ -1,27 +1,28 @@
import assert from 'node:assert';
import type Jimp from 'jimp';
import type { NodeType } from '../constants';
import type { Rect } from '../types';
import type { BaseElement } from '../types';
import getJimp from './get-jimp';
import { bufferFromBase64, imageInfoOfBase64 } from './index';
// Shape of the element info consumed by the annotation-overlay helpers below
// Describes one page element to be drawn onto the annotated screenshot.
type ElementType = {
  // Optional selector/locator string identifying the element — TODO confirm format with extractor
  locator?: string;
  // Bounding box of the element on the page
  rect: Rect;
  // Precomputed [x, y] center point, if available
  center?: [number, number];
  id?: string;
  // Numeric label rendered next to the element's box in the overlay
  indexId: number;
  attributes?: {
    // Node classification (used elsewhere to filter out text nodes)
    nodeType: NodeType;
    // All other attributes are plain string values
    [key: string]: string;
  };
};
let cachedFont: any = null;
// Load the bundled 16px white sans font used for index labels.
// If the local font fails to load (e.g. a packaging issue), fall back
// to a CDN-hosted copy of the same open-sans bitmap font.
const loadFonts = async () => {
  const Jimp = await getJimp();
  try {
    // `return await` keeps a rejection inside this try, so the CDN
    // fallback below still runs on failure.
    return await Jimp.loadFont(Jimp.FONT_SANS_16_WHITE);
  } catch (error) {
    console.warn('Error loading font, will try to load online fonts', error);
    const onlineFonts =
      'https://cdn.jsdelivr.net/npm/jimp-compact@0.16.1-2/fonts/open-sans/open-sans-16-white/open-sans-16-white.fnt';
    return await Jimp.loadFont(onlineFonts);
  }
};
const createSvgOverlay = async (
elements: Array<ElementType>,
elements: Array<BaseElement>,
imageWidth: number,
imageHeight: number,
boxPadding = 5,
@ -78,7 +79,11 @@ const createSvgOverlay = async (
);
// Calculate text position
const textWidth = element.indexId.toString().length * 8;
const indexId = element.indexId;
if (typeof indexId !== 'number') {
continue;
}
const textWidth = indexId.toString().length * 8;
const textHeight = 12;
const rectWidth = textWidth + 5;
const rectHeight = textHeight + 4;
@ -164,7 +169,7 @@ const createSvgOverlay = async (
);
// Draw text (simplified, as Jimp doesn't have built-in text drawing)
try {
cachedFont = cachedFont || (await Jimp.loadFont(Jimp.FONT_SANS_16_WHITE));
cachedFont = cachedFont || (await loadFonts());
} catch (error) {
console.error('Error loading font', error);
}
@ -173,7 +178,7 @@ const createSvgOverlay = async (
rectX,
rectY,
{
text: element.indexId.toString(),
text: indexId.toString(),
alignmentX: Jimp.HORIZONTAL_ALIGN_CENTER,
alignmentY: Jimp.VERTICAL_ALIGN_MIDDLE,
},
@ -187,7 +192,7 @@ const createSvgOverlay = async (
export const compositeElementInfoImg = async (options: {
inputImgBase64: string;
elementsPositionInfo: Array<ElementType>;
elementsPositionInfo: Array<BaseElement>;
size?: { width: number; height: number };
annotationPadding?: number;
}) => {
@ -255,8 +260,8 @@ export const compositeElementInfoImg = async (options: {
export const processImageElementInfo = async (options: {
inputImgBase64: string;
elementsPositionInfo: Array<ElementType>;
elementsPositionInfoWithoutText: Array<ElementType>;
elementsPositionInfo: Array<BaseElement>;
elementsPositionInfoWithoutText: Array<BaseElement>;
}) => {
// Get the size of the original image
const base64Image = options.inputImgBase64.split(';base64,').pop();

View File

@ -69,7 +69,7 @@ export const Blackboard = (props: {
const highlightRect = props.highlightRect;
const context = props.uiContext!;
const { size, screenshotBase64, screenshotBase64WithElementMarker } = context;
const { size, screenshotBase64 } = context;
const screenWidth = size.width;
const screenHeight = size.height;
@ -88,8 +88,6 @@ export const Blackboard = (props: {
const { markerVisible, setMarkerVisible, elementsVisible, setTextsVisible } =
useBlackboardPreference();
const ifMarkerAvailable = !!screenshotBase64WithElementMarker;
useEffect(() => {
Promise.resolve(
(async () => {
@ -147,25 +145,6 @@ export const Blackboard = (props: {
backgroundSprite.width = screenWidth;
backgroundSprite.height = screenHeight;
app.stage.addChildAt(backgroundSprite, 0);
if (ifMarkerAvailable) {
const markerImg = new Image();
markerImg.onload = () => {
const markerTexture = PIXI.Texture.from(markerImg);
const markerSprite = new PIXI.Sprite(markerTexture);
markerSprite.x = 0;
markerSprite.y = 0;
markerSprite.width = screenWidth;
markerSprite.height = screenHeight;
app.stage.addChildAt(markerSprite, 1);
pixiBgRef.current = markerSprite;
markerSprite.visible = markerVisible;
};
markerImg.onerror = (e) => {
console.error('load marker failed', e);
};
markerImg.src = screenshotBase64WithElementMarker;
}
};
img.onerror = (e) => {
console.error('load screenshot failed', e);
@ -268,13 +247,6 @@ export const Blackboard = (props: {
style={{ display: props.hideController ? 'none' : 'block' }}
>
<div className="overlay-control">
<Checkbox
checked={markerVisible}
onChange={onSetMarkerVisible}
disabled={!ifMarkerAvailable}
>
Marker
</Checkbox>
<Checkbox checked={elementsVisible} onChange={onSetElementsVisible}>
Elements
</Checkbox>

View File

@ -619,7 +619,7 @@ export function Player(props?: {
currentImg.current = item.img;
await repaintImage();
const elements = item.insightDump.context.content;
const elements = item.context?.content || [];
const highlightElements = item.insightDump.matchedElement;
await insightElementsAnimation(
elements,

View File

@ -6,12 +6,12 @@ import { paramStr, typeStr } from '@midscene/web/ui-utils';
import type {
ExecutionDump,
ExecutionTask,
ExecutionTaskApply,
ExecutionTaskInsightLocate,
ExecutionTaskPlanning,
GroupedActionDump,
InsightDump,
Rect,
UIContext,
} from '@midscene/core';
export interface CameraState {
@ -39,6 +39,7 @@ export interface AnimationScript {
img?: string;
camera?: TargetCameraState;
insightDump?: InsightDump;
context?: UIContext;
duration: number;
insightCameraDuration?: number;
title?: string;
@ -280,7 +281,7 @@ export const generateAnimationScripts = (
});
initSubTitle = paramStr(task);
}
} else if (task.type === 'Insight') {
} else if (task.type === 'Insight' && task.subType === 'Locate') {
const insightTask = task as ExecutionTaskInsightLocate;
const resultElement = insightTask.output?.element;
const title = typeStr(task);
@ -292,18 +293,16 @@ export const generateAnimationScripts = (
pointerTop: resultElement.center[1],
};
}
if (insightTask.log?.dump) {
const context = insightTask.pageContext;
if (insightTask.log?.dump && context?.screenshotBase64) {
const insightDump = insightTask.log.dump;
if (!insightDump?.context?.screenshotBase64) {
throw new Error('insight dump is required');
}
const insightContentLength = insightDump.context.content.length;
const insightContentLength = context.content.length;
if (insightDump.context.screenshotBase64WithElementMarker) {
if (context.screenshotBase64) {
// show the original screenshot first
scripts.push({
type: 'img',
img: insightDump.context.screenshotBase64,
img: context.screenshotBase64,
duration: stillAfterInsightDuration,
title,
subTitle,
@ -324,9 +323,8 @@ export const generateAnimationScripts = (
scripts.push({
type: 'insight',
img:
insightDump.context.screenshotBase64WithElementMarker ||
insightDump.context.screenshotBase64,
img: context.screenshotBase64,
context: context,
insightDump: insightDump,
camera: cameraState,
duration:
@ -435,7 +433,7 @@ export const generateAnimationScripts = (
});
}
// console.log('replayscripts');
// console.log('replay scripts');
// console.log(scripts, tasksIncluded);
return scripts;

View File

@ -4,17 +4,11 @@ import type {
PlaywrightParserOpt,
UIContext,
} from '@midscene/core';
import {
MIDSCENE_REPORT_TAG_NAME,
MIDSCENE_USE_VLM_UI_TARS,
getAIConfig,
getAIConfigInBoolean,
} from '@midscene/core/env';
import { MIDSCENE_REPORT_TAG_NAME, getAIConfig } from '@midscene/core/env';
import { uploadTestInfoToServer } from '@midscene/core/utils';
import { NodeType } from '@midscene/shared/constants';
import type { ElementInfo } from '@midscene/shared/extractor';
import { traverseTree, treeToList } from '@midscene/shared/extractor';
import { compositeElementInfoImg, resizeImgBase64 } from '@midscene/shared/img';
import { resizeImgBase64 } from '@midscene/shared/img';
import { assert, logMsg, uuid } from '@midscene/shared/utils';
import dayjs from 'dayjs';
import { WebElementInfo } from '../web-element';
@ -58,19 +52,9 @@ export async function parseContextFromWebPage(
});
});
const elementsInfo = treeToList(webTree);
assert(screenshotBase64!, 'screenshotBase64 is required');
const elementsPositionInfoWithoutText = elementsInfo!.filter(
(elementInfo) => {
if (elementInfo.attributes.nodeType === NodeType.TEXT) {
return false;
}
return true;
},
);
const elementsInfo = treeToList(webTree);
const size = await page.size();
if (size.dpr && size.dpr > 1) {
@ -82,25 +66,11 @@ export async function parseContextFromWebPage(
// console.timeEnd('resizeImgBase64');
}
let screenshotBase64WithElementMarker = screenshotBase64;
if (!getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS)) {
if (_opt?.ignoreMarker) {
screenshotBase64WithElementMarker = screenshotBase64;
} else {
screenshotBase64WithElementMarker = await compositeElementInfoImg({
inputImgBase64: screenshotBase64,
elementsPositionInfo: elementsPositionInfoWithoutText,
size,
});
}
}
return {
content: elementsInfo!,
tree: webTree,
size,
screenshotBase64: screenshotBase64!,
screenshotBase64WithElementMarker: screenshotBase64WithElementMarker,
url,
};
}