fix(report): reduce context size in report file (#626)

* fix(core): reduce context size in report file

* chore(core): fix lint

* chore(core): resolve conflict

---------

Co-authored-by: zhouxiao.shaw <zhouxiao.shaw@bytedance.com>
This commit is contained in:
yuyutaotao 2025-04-24 18:28:45 +08:00 committed by GitHub
parent ce7929bbbc
commit ecefd8b0fa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 108 additions and 114 deletions

View File

@ -5,7 +5,7 @@ import { pluginLess } from '@rsbuild/plugin-less';
import { pluginNodePolyfill } from '@rsbuild/plugin-node-polyfill';
import { pluginReact } from '@rsbuild/plugin-react';
const testDataPath = path.join(__dirname, 'test-data', 'online-order.json');
const testDataPath = path.join(__dirname, 'test-data', 'swag-lab.json');
const testData = JSON.parse(fs.readFileSync(testDataPath, 'utf-8'));
const copyReportTemplate = () => ({

View File

@ -33,12 +33,12 @@ const VIEW_TYPE_JSON = 'json';
const DetailPanel = (): JSX.Element => {
const insightDump = useExecutionDump((store) => store.insightDump);
const dumpId = useExecutionDump((store) => store._insightDumpLoadId);
const blackboardViewAvailable = Boolean(insightDump);
const activeExecution = useExecutionDump((store) => store.activeExecution);
const activeExecutionId = useExecutionDump(
(store) => store._executionDumpLoadId,
);
const activeTask = useExecutionDump((store) => store.activeTask);
const blackboardViewAvailable = Boolean(activeTask?.pageContext);
const [preferredViewType, setViewType] = useState(VIEW_TYPE_REPLAY);
const animationScripts = useExecutionDump(
(store) => store.activeExecutionAnimation,
@ -89,7 +89,7 @@ const DetailPanel = (): JSX.Element => {
if (blackboardViewAvailable) {
content = (
<Blackboard
uiContext={insightDump!.context}
uiContext={activeTask.pageContext}
highlightElements={insightDump!.matchedElement}
highlightRect={insightDump!.taskInfo?.searchArea}
key={`${dumpId}`}

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,10 @@
import type { AIUsageInfo, Rect, Size } from '@/types';
import type {
AIUsageInfo,
BaseElement,
ElementTreeNode,
Rect,
Size,
} from '@/types';
import { assert } from '@midscene/shared/utils';
import type {
@ -13,6 +19,9 @@ import {
import { vlLocateMode } from '@/env';
import type { PlanningLocateParam } from '@/types';
import { NodeType } from '@midscene/shared/constants';
import { treeToList } from '@midscene/shared/extractor';
import { compositeElementInfoImg } from '@midscene/shared/img';
import { getDebug } from '@midscene/shared/logger';
export type AIArgs = [
@ -261,3 +270,26 @@ export function expandSearchArea(rect: Rect, screenSize: Size) {
);
return rect;
}
/**
 * Composite element markers onto a screenshot so a non-vision-grounded LLM
 * can reference page elements by their drawn index.
 *
 * Text nodes are excluded from the markup: they are numerous and their
 * boxes would clutter the image without aiding element grounding.
 *
 * @param screenshotBase64 - base64-encoded screenshot to annotate
 * @param tree - element tree of the page; flattened before filtering
 * @param size - page size used when compositing the annotated image
 * @returns base64-encoded image with non-text elements visually marked
 */
export async function markupImageForLLM(
  screenshotBase64: string,
  tree: ElementTreeNode<BaseElement>,
  size: Size,
) {
  // Flatten the tree; guard explicitly instead of using a non-null assertion.
  const elementsInfo = treeToList(tree) ?? [];
  // Keep only non-text nodes for annotation.
  const elementsPositionInfoWithoutText = elementsInfo.filter(
    (elementInfo) => elementInfo.attributes.nodeType !== NodeType.TEXT,
  );
  const imagePayload = await compositeElementInfoImg({
    inputImgBase64: screenshotBase64,
    // NOTE(review): cast carried over from original — tighten the
    // compositeElementInfoImg element type so this `as any` can go away.
    elementsPositionInfo: elementsPositionInfoWithoutText as any,
    size,
  });
  return imagePayload;
}

View File

@ -32,6 +32,7 @@ import {
adaptBboxToRect,
callAiFn,
expandSearchArea,
markupImageForLLM,
mergeRects,
} from './common';
import { systemPromptToAssert } from './prompt/assertion';
@ -128,7 +129,7 @@ export async function AiLocateElement<
usage?: AIUsageInfo;
}> {
const { context, targetElementDescription, callAI } = options;
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
const { screenshotBase64 } = context;
const { description, elementById, insertElementByPosition, size } =
await describeUserPage(context);
// meet quick answer
@ -153,7 +154,7 @@ export async function AiLocateElement<
});
const systemPrompt = systemPromptToLocateElement(vlLocateMode());
let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
let imagePayload = screenshotBase64;
if (options.searchConfig) {
assert(
@ -166,8 +167,14 @@ export async function AiLocateElement<
);
imagePayload = options.searchConfig.imageBase64;
} else if (getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL)) {
} else if (vlLocateMode() === 'qwen-vl') {
imagePayload = await paddingToMatchBlockByBase64(imagePayload);
} else if (!vlLocateMode()) {
imagePayload = await markupImageForLLM(
screenshotBase64,
context.tree,
context.size,
);
}
const msgs: AIArgs = [

View File

@ -7,6 +7,7 @@ import {
type AIArgs,
callAiFn,
fillLocateParam,
markupImageForLLM,
warnGPT4oSizeLimit,
} from './common';
import {
@ -27,7 +28,7 @@ export async function plan(
},
): Promise<PlanningAIResponse> {
const { callAI, context } = opts || {};
const { screenshotBase64, screenshotBase64WithElementMarker, size } = context;
const { screenshotBase64, size } = context;
const { description: pageDescription } = await describeUserPage(context);
const systemPrompt = await systemPromptToTaskPlanning({
@ -46,9 +47,15 @@ export async function plan(
taskBackgroundContext: taskBackgroundContextText,
});
let imagePayload = screenshotBase64WithElementMarker || screenshotBase64;
let imagePayload = screenshotBase64;
if (vlLocateMode() === 'qwen-vl') {
imagePayload = await paddingToMatchBlockByBase64(imagePayload);
} else if (!vlLocateMode()) {
imagePayload = await markupImageForLLM(
screenshotBase64,
context.tree,
context.size,
);
}
warnGPT4oSizeLimit(size);

View File

@ -340,7 +340,7 @@ export async function callToGetJSONObject<T>(
const model = getModelName();
if (model.includes('gpt-4o') || model.includes('gpt-4.1')) {
if (model.includes('gpt-4')) {
switch (AIActionTypeValue) {
case AIActionType.ASSERT:
responseFormat = assertSchema;

View File

@ -158,7 +158,6 @@ export default class Insight<
const dumpData: PartialInsightDumpFromSDK = {
type: 'locate',
context,
userQuery: {
element: queryPrompt,
},
@ -256,7 +255,6 @@ export default class Insight<
const dumpData: PartialInsightDumpFromSDK = {
type: 'extract',
context,
userQuery: {
dataDemand,
},
@ -314,7 +312,6 @@ export default class Insight<
const { thought, pass } = assertResult.content;
const dumpData: PartialInsightDumpFromSDK = {
type: 'assert',
context,
userQuery: {
assertion,
},

View File

@ -119,8 +119,6 @@ export interface AIAssertionResponse {
export abstract class UIContext<ElementType extends BaseElement = BaseElement> {
abstract screenshotBase64: string;
abstract screenshotBase64WithElementMarker?: string;
// @deprecated('use tree instead')
abstract content: ElementType[];
@ -193,7 +191,6 @@ export interface ReportDumpWithAttributes {
export interface InsightDump extends DumpMeta {
type: 'locate' | 'extract' | 'assert';
logId: string;
context: UIContext;
userQuery: {
element?: string;
dataDemand?: InsightExtractParam;

View File

@ -146,6 +146,14 @@ export function writeDumpReport(
return null;
}
writeFileSync(reportPath, reportContent);
if (process.env.MIDSCENE_DEBUG_LOG_JSON) {
writeFileSync(
`${reportPath}.json`,
typeof dumpData === 'string'
? dumpData
: JSON.stringify(dumpData, null, 2),
);
}
return reportPath;
}

View File

@ -1,27 +1,28 @@
import assert from 'node:assert';
import type Jimp from 'jimp';
import type { NodeType } from '../constants';
import type { Rect } from '../types';
import type { BaseElement } from '../types';
import getJimp from './get-jimp';
import { bufferFromBase64, imageInfoOfBase64 } from './index';
// Shape of the element info consumed by the annotation-overlay helpers below
// Describes one page element to be drawn onto the annotated screenshot.
type ElementType = {
  // Optional selector/locator string identifying the element — TODO confirm format with extractor
  locator?: string;
  // Bounding box of the element on the page
  rect: Rect;
  // Precomputed [x, y] center point, if available
  center?: [number, number];
  id?: string;
  // Numeric label rendered next to the element's box in the overlay
  indexId: number;
  attributes?: {
    // Node classification (used elsewhere to filter out text nodes)
    nodeType: NodeType;
    // All other attributes are plain string values
    [key: string]: string;
  };
};
let cachedFont: any = null;
// Load the bundled 16px white sans font used for index labels.
// If the local font fails to load (e.g. a packaging issue), fall back
// to a CDN-hosted copy of the same open-sans bitmap font.
const loadFonts = async () => {
  const Jimp = await getJimp();
  try {
    // `return await` keeps a rejection inside this try, so the CDN
    // fallback below still runs on failure.
    return await Jimp.loadFont(Jimp.FONT_SANS_16_WHITE);
  } catch (error) {
    console.warn('Error loading font, will try to load online fonts', error);
    const onlineFonts =
      'https://cdn.jsdelivr.net/npm/jimp-compact@0.16.1-2/fonts/open-sans/open-sans-16-white/open-sans-16-white.fnt';
    return await Jimp.loadFont(onlineFonts);
  }
};
const createSvgOverlay = async (
elements: Array<ElementType>,
elements: Array<BaseElement>,
imageWidth: number,
imageHeight: number,
boxPadding = 5,
@ -78,7 +79,11 @@ const createSvgOverlay = async (
);
// Calculate text position
const textWidth = element.indexId.toString().length * 8;
const indexId = element.indexId;
if (typeof indexId !== 'number') {
continue;
}
const textWidth = indexId.toString().length * 8;
const textHeight = 12;
const rectWidth = textWidth + 5;
const rectHeight = textHeight + 4;
@ -164,7 +169,7 @@ const createSvgOverlay = async (
);
// Draw text (simplified, as Jimp doesn't have built-in text drawing)
try {
cachedFont = cachedFont || (await Jimp.loadFont(Jimp.FONT_SANS_16_WHITE));
cachedFont = cachedFont || (await loadFonts());
} catch (error) {
console.error('Error loading font', error);
}
@ -173,7 +178,7 @@ const createSvgOverlay = async (
rectX,
rectY,
{
text: element.indexId.toString(),
text: indexId.toString(),
alignmentX: Jimp.HORIZONTAL_ALIGN_CENTER,
alignmentY: Jimp.VERTICAL_ALIGN_MIDDLE,
},
@ -187,7 +192,7 @@ const createSvgOverlay = async (
export const compositeElementInfoImg = async (options: {
inputImgBase64: string;
elementsPositionInfo: Array<ElementType>;
elementsPositionInfo: Array<BaseElement>;
size?: { width: number; height: number };
annotationPadding?: number;
}) => {
@ -255,8 +260,8 @@ export const compositeElementInfoImg = async (options: {
export const processImageElementInfo = async (options: {
inputImgBase64: string;
elementsPositionInfo: Array<ElementType>;
elementsPositionInfoWithoutText: Array<ElementType>;
elementsPositionInfo: Array<BaseElement>;
elementsPositionInfoWithoutText: Array<BaseElement>;
}) => {
// Get the size of the original image
const base64Image = options.inputImgBase64.split(';base64,').pop();

View File

@ -69,7 +69,7 @@ export const Blackboard = (props: {
const highlightRect = props.highlightRect;
const context = props.uiContext!;
const { size, screenshotBase64, screenshotBase64WithElementMarker } = context;
const { size, screenshotBase64 } = context;
const screenWidth = size.width;
const screenHeight = size.height;
@ -88,8 +88,6 @@ export const Blackboard = (props: {
const { markerVisible, setMarkerVisible, elementsVisible, setTextsVisible } =
useBlackboardPreference();
const ifMarkerAvailable = !!screenshotBase64WithElementMarker;
useEffect(() => {
Promise.resolve(
(async () => {
@ -147,25 +145,6 @@ export const Blackboard = (props: {
backgroundSprite.width = screenWidth;
backgroundSprite.height = screenHeight;
app.stage.addChildAt(backgroundSprite, 0);
if (ifMarkerAvailable) {
const markerImg = new Image();
markerImg.onload = () => {
const markerTexture = PIXI.Texture.from(markerImg);
const markerSprite = new PIXI.Sprite(markerTexture);
markerSprite.x = 0;
markerSprite.y = 0;
markerSprite.width = screenWidth;
markerSprite.height = screenHeight;
app.stage.addChildAt(markerSprite, 1);
pixiBgRef.current = markerSprite;
markerSprite.visible = markerVisible;
};
markerImg.onerror = (e) => {
console.error('load marker failed', e);
};
markerImg.src = screenshotBase64WithElementMarker;
}
};
img.onerror = (e) => {
console.error('load screenshot failed', e);
@ -268,13 +247,6 @@ export const Blackboard = (props: {
style={{ display: props.hideController ? 'none' : 'block' }}
>
<div className="overlay-control">
<Checkbox
checked={markerVisible}
onChange={onSetMarkerVisible}
disabled={!ifMarkerAvailable}
>
Marker
</Checkbox>
<Checkbox checked={elementsVisible} onChange={onSetElementsVisible}>
Elements
</Checkbox>

View File

@ -619,7 +619,7 @@ export function Player(props?: {
currentImg.current = item.img;
await repaintImage();
const elements = item.insightDump.context.content;
const elements = item.context?.content || [];
const highlightElements = item.insightDump.matchedElement;
await insightElementsAnimation(
elements,

View File

@ -6,12 +6,12 @@ import { paramStr, typeStr } from '@midscene/web/ui-utils';
import type {
ExecutionDump,
ExecutionTask,
ExecutionTaskApply,
ExecutionTaskInsightLocate,
ExecutionTaskPlanning,
GroupedActionDump,
InsightDump,
Rect,
UIContext,
} from '@midscene/core';
export interface CameraState {
@ -39,6 +39,7 @@ export interface AnimationScript {
img?: string;
camera?: TargetCameraState;
insightDump?: InsightDump;
context?: UIContext;
duration: number;
insightCameraDuration?: number;
title?: string;
@ -280,7 +281,7 @@ export const generateAnimationScripts = (
});
initSubTitle = paramStr(task);
}
} else if (task.type === 'Insight') {
} else if (task.type === 'Insight' && task.subType === 'Locate') {
const insightTask = task as ExecutionTaskInsightLocate;
const resultElement = insightTask.output?.element;
const title = typeStr(task);
@ -292,18 +293,16 @@ export const generateAnimationScripts = (
pointerTop: resultElement.center[1],
};
}
if (insightTask.log?.dump) {
const context = insightTask.pageContext;
if (insightTask.log?.dump && context?.screenshotBase64) {
const insightDump = insightTask.log.dump;
if (!insightDump?.context?.screenshotBase64) {
throw new Error('insight dump is required');
}
const insightContentLength = insightDump.context.content.length;
const insightContentLength = context.content.length;
if (insightDump.context.screenshotBase64WithElementMarker) {
if (context.screenshotBase64) {
// show the original screenshot first
scripts.push({
type: 'img',
img: insightDump.context.screenshotBase64,
img: context.screenshotBase64,
duration: stillAfterInsightDuration,
title,
subTitle,
@ -324,9 +323,8 @@ export const generateAnimationScripts = (
scripts.push({
type: 'insight',
img:
insightDump.context.screenshotBase64WithElementMarker ||
insightDump.context.screenshotBase64,
img: context.screenshotBase64,
context: context,
insightDump: insightDump,
camera: cameraState,
duration:
@ -435,7 +433,7 @@ export const generateAnimationScripts = (
});
}
// console.log('replayscripts');
// console.log('replay scripts');
// console.log(scripts, tasksIncluded);
return scripts;

View File

@ -4,17 +4,11 @@ import type {
PlaywrightParserOpt,
UIContext,
} from '@midscene/core';
import {
MIDSCENE_REPORT_TAG_NAME,
MIDSCENE_USE_VLM_UI_TARS,
getAIConfig,
getAIConfigInBoolean,
} from '@midscene/core/env';
import { MIDSCENE_REPORT_TAG_NAME, getAIConfig } from '@midscene/core/env';
import { uploadTestInfoToServer } from '@midscene/core/utils';
import { NodeType } from '@midscene/shared/constants';
import type { ElementInfo } from '@midscene/shared/extractor';
import { traverseTree, treeToList } from '@midscene/shared/extractor';
import { compositeElementInfoImg, resizeImgBase64 } from '@midscene/shared/img';
import { resizeImgBase64 } from '@midscene/shared/img';
import { assert, logMsg, uuid } from '@midscene/shared/utils';
import dayjs from 'dayjs';
import { WebElementInfo } from '../web-element';
@ -58,19 +52,9 @@ export async function parseContextFromWebPage(
});
});
const elementsInfo = treeToList(webTree);
assert(screenshotBase64!, 'screenshotBase64 is required');
const elementsPositionInfoWithoutText = elementsInfo!.filter(
(elementInfo) => {
if (elementInfo.attributes.nodeType === NodeType.TEXT) {
return false;
}
return true;
},
);
const elementsInfo = treeToList(webTree);
const size = await page.size();
if (size.dpr && size.dpr > 1) {
@ -82,25 +66,11 @@ export async function parseContextFromWebPage(
// console.timeEnd('resizeImgBase64');
}
let screenshotBase64WithElementMarker = screenshotBase64;
if (!getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS)) {
if (_opt?.ignoreMarker) {
screenshotBase64WithElementMarker = screenshotBase64;
} else {
screenshotBase64WithElementMarker = await compositeElementInfoImg({
inputImgBase64: screenshotBase64,
elementsPositionInfo: elementsPositionInfoWithoutText,
size,
});
}
}
return {
content: elementsInfo!,
tree: webTree,
size,
screenshotBase64: screenshotBase64!,
screenshotBase64WithElementMarker: screenshotBase64WithElementMarker,
url,
};
}