fix(web-integration): add xss package and sanitize input in web-extractor

This commit is contained in:
quanruzhuoxiu 2025-05-20 20:32:53 +08:00
parent 049d9f6565
commit 7f4150c533
7 changed files with 113 additions and 8 deletions

View File

@ -59,7 +59,8 @@
"dependencies": {
"debug": "4.4.0",
"jimp": "0.22.12",
"js-sha256": "0.11.0"
"js-sha256": "0.11.0",
"xss": "1.0.15"
},
"devDependencies": {
"@modern-js/module-tools": "2.60.6",

View File

@ -1,3 +1,4 @@
import xss from 'xss';
import type { Rect } from '../types';
import { generateHashId } from '../utils';
import { extractTextWithPosition } from './web-extractor';
@ -427,7 +428,7 @@ export function getNodeAttributes(
return [];
}
let value = attr.value;
let value = xss(attr.value);
if (value.startsWith('data:image')) {
value = 'image';
}

View File

@ -1,3 +1,4 @@
import xss from 'xss';
import {
CONTAINER_MINI_HEIGHT,
CONTAINER_MINI_WIDTH,
@ -82,8 +83,6 @@ export function collectElementInfo(
const attributes = getNodeAttributes(node, currentWindow);
let valueContent =
attributes.value || attributes.placeholder || node.textContent || '';
const nodeHashId = midsceneGenerateHash(node, valueContent, rect);
const selector = setDataForNode(node, nodeHashId, false, currentWindow);
const tagName = (node as HTMLElement).tagName.toLowerCase();
if ((node as HTMLElement).tagName.toLowerCase() === 'select') {
// Get the selected option using the selectedIndex property
@ -103,6 +102,11 @@ export function collectElementInfo(
valueContent = (node as HTMLInputElement).value;
}
valueContent = xss(valueContent);
const nodeHashId = midsceneGenerateHash(node, valueContent, rect);
const selector = setDataForNode(node, nodeHashId, false, currentWindow);
const elementInfo: WebElementInfo = {
id: nodeHashId,
nodeHashId,
@ -128,7 +132,10 @@ export function collectElementInfo(
if (isButtonElement(node)) {
const attributes = getNodeAttributes(node, currentWindow);
const pseudo = getPseudoElementContent(node, currentWindow);
const content = node.innerText || pseudo.before || pseudo.after || '';
let content = node.innerText || pseudo.before || pseudo.after || '';
content = xss(content);
const nodeHashId = midsceneGenerateHash(node, content, rect);
const selector = setDataForNode(node, nodeHashId, false, currentWindow);
const elementInfo: WebElementInfo = {
@ -185,10 +192,14 @@ export function collectElementInfo(
}
if (isTextElement(node)) {
const text = node.textContent?.trim().replace(/\n+/g, ' ');
let text = node.textContent?.trim().replace(/\n+/g, ' ');
if (!text) {
return null;
}
// Sanitize the text content to prevent XSS
text = xss(text);
const attributes = getNodeAttributes(node, currentWindow);
const attributeKeys = Object.keys(attributes);
if (!text.trim() && attributeKeys.length === 0) {

View File

@ -3116,6 +3116,31 @@ exports[`extractor > basic 2`] = `
}
`;
exports[`extractor > collectElementInfo from xss page 1`] = `
"<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Child Page</title>
</head>
<body>
<h1>XSS Injection Examples</h1>
<button>hello world!</button>
<input
value="<script>
setTimeout(() => {
alert('1');
}, 5000);
</script>
hello world!" />
</input>
</body>
</html>
"
`;
exports[`extractor > getElementInfoByXpath by evaluateJavaScript 1`] = `
{
"attributes": {

View File

@ -0,0 +1,21 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Child Page</title>
</head>
<body>
<h1>XSS Injection Examples</h1>
<button>hello world!</button>
<input
value="<script>
setTimeout(() => {
alert('1');
}, 5000);
</script>
hello world!" />
</input>
</body>
</html>

View File

@ -1,8 +1,8 @@
import fs from 'node:fs';
import { join } from 'node:path';
import { parseContextFromWebPage } from '@/common/utils';
import StaticPage from '@/playground/static-page';
import type { WebElementInfo } from '@/web-element';
import { sleep } from '@midscene/core/utils';
import { traverseTree } from '@midscene/shared/extractor';
import { getElementInfosScriptContent } from '@midscene/shared/fs';
import {
@ -13,7 +13,6 @@ import {
import { createServer } from 'http-server';
import { beforeAll, describe, expect, it } from 'vitest';
import { launchPage } from '../ai/web/puppeteer/utils';
const pageDir = join(__dirname, './fixtures/web-extractor');
const pagePath = join(pageDir, 'index.html');
@ -244,6 +243,35 @@ describe(
expect(element).toBe(null);
await reset();
});
// Regression test for input sanitisation in the element extractor: loads a
// page whose <input value="..."> carries a <script> payload and checks that the
// extracted element info no longer contains the raw `<script>` tag.
it('collectElementInfo from xss page', async () => {
  const { page, reset } = await launchPage(
    `http://127.0.0.1:${port}/xss.html`,
    {
      viewport: {
        width: 1080,
        height: 3000,
        deviceScaleFactor: 1,
      },
    },
  );

  // try/finally so that reset() still runs when an assertion below throws;
  // otherwise a failing expectation would skip the cleanup entirely.
  try {
    // Sanity-check the fixture on disk: it must really contain the payload,
    // otherwise the negative assertions further down would pass vacuously.
    // NOTE(review): presumably this is the same file the server exposes as
    // /xss.html — confirm against the server setup.
    const xssHtmlContent = fs.readFileSync(
      join(pageDir, 'xss.html'),
      'utf-8',
    );
    expect(xssHtmlContent).toContain('<script>');
    expect(xssHtmlContent).toMatchSnapshot();

    // Inject the extractor bundle and look the <input> up by its XPath.
    const elementInfosScriptContent = getElementInfosScriptContent();
    const element = await page.evaluateJavaScript?.(
      `${elementInfosScriptContent}midscene_element_inspector.getElementInfoByXpath('/html/body/input')`,
    );

    // evaluateJavaScript is optional on the page object (called via `?.`), so
    // the result may be undefined; fail with a clear assertion instead of a
    // TypeError on property access.
    expect(element).toBeTruthy();

    expect(element.content).not.toContain('<script>');
    expect(element.attributes?.value).not.toContain('<script>');
  } finally {
    await reset();
  }
});
},
{
timeout: 90 * 1000,

18
pnpm-lock.yaml generated
View File

@ -597,6 +597,9 @@ importers:
js-sha256:
specifier: 0.11.0
version: 0.11.0
xss:
specifier: 1.0.15
version: 1.0.15
devDependencies:
'@modern-js/module-tools':
specifier: 2.60.6
@ -6641,6 +6644,9 @@ packages:
engines: {node: '>=4'}
hasBin: true
cssfilter@0.0.10:
resolution: {integrity: sha512-FAaLDaplstoRsDR8XGYH51znUN0UY7nMc6Z9/fvE8EXGwvJE9hu7W2vHwx1+bd6gCYnln9nLbzxFTrcO9YQDZw==}
cssnano-preset-default@6.1.2:
resolution: {integrity: sha512-1C0C+eNaeN8OcHQa193aRgYexyJtU8XwbdieEjClw+J9d94E41LwT6ivKH0WT+fYwYWB0Zp3I3IZ7tI/BbUbrg==}
engines: {node: ^14 || ^16 || >=18.0}
@ -12916,6 +12922,11 @@ packages:
resolution: {integrity: sha512-TEU+nJVUUnA4CYJFLvK5X9AOeH4KvDvhIfm0vV1GaQRtchnG0hgK5p8hw/xjv8cunWYCsiPCSDzObPyhEwq3KQ==}
engines: {node: '>=0.4.0'}
xss@1.0.15:
resolution: {integrity: sha512-FVdlVVC67WOIPvfOwhoMETV72f6GbW7aOabBC3WxN/oUdoEMDyLz4OgRv5/gck2ZeNqEQu+Tb0kloovXOfpYVg==}
engines: {node: '>= 0.10.0'}
hasBin: true
xtend@4.0.2:
resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==}
engines: {node: '>=0.4'}
@ -21227,6 +21238,8 @@ snapshots:
cssesc@3.0.0: {}
cssfilter@0.0.10: {}
cssnano-preset-default@6.1.2(postcss@8.4.27):
dependencies:
browserslist: 4.24.4
@ -29252,6 +29265,11 @@ snapshots:
xmlhttprequest-ssl@2.1.2: {}
xss@1.0.15:
dependencies:
commander: 2.20.3
cssfilter: 0.0.10
xtend@4.0.2: {}
y18n@4.0.3: {}