2024-08-21 17:24:32 +08:00
|
|
|
import path, { join } from 'node:path';
|
2024-08-20 07:41:08 +08:00
|
|
|
import { parseContextFromWebPage } from '@/common/utils';
|
2024-08-22 18:12:01 +08:00
|
|
|
import { generateExtractData } from '@/debug';
|
2024-10-21 16:30:07 +08:00
|
|
|
import StaticPage from '@/playground/static-page';
|
2024-12-08 20:12:17 +08:00
|
|
|
import type { WebElementInfo } from '@/web-element';
|
2024-10-28 11:04:40 +08:00
|
|
|
import { imageInfoOfBase64 } from '@midscene/shared/img';
|
2024-08-20 07:41:08 +08:00
|
|
|
import { describe, expect, it } from 'vitest';
|
2024-09-06 17:19:35 +08:00
|
|
|
import { launchPage } from '../ai/web/puppeteer/utils';
|
2024-08-20 07:41:08 +08:00
|
|
|
|
2024-09-05 20:05:19 +08:00
|
|
|
/** Absolute path of the HTML fixture that the extractor tests load via a `file://` URL. */
const pagePath = path.join(
  __dirname,
  './fixtures/web-extractor/index.html',
);
|
2024-08-21 16:28:58 +08:00
|
|
|
/** Resolves after `ms` milliseconds; lets page-side timers and scrolling settle. */
const sleep = (ms: number) =>
  new Promise<void>((resolve) => setTimeout(resolve, ms));

/**
 * Tests for extracting page context from the web-extractor HTML fixture.
 *
 * Every test that launches a page wraps its body in `try/finally`, so the
 * `reset` callback returned by `launchPage` runs even when an assertion or an
 * extraction step throws. Otherwise a failing test would skip its cleanup.
 */
describe(
  'extractor',
  () => {
    // Snapshot the extracted content/attributes of the fixture page and
    // regenerate the extraction artifacts next to the fixture.
    it('basic', async () => {
      const { page, reset } = await launchPage(`file://${pagePath}`, {
        viewport: {
          width: 1080,
          height: 3000,
        },
      });

      try {
        const { content } = await parseContextFromWebPage(page);
        await generateExtractData(
          page,
          path.join(__dirname, 'fixtures/web-extractor'),
          {
            disableInputImage: false,
            disableOutputImage: false,
            disableOutputWithoutTextImg: true,
            disableResizeOutputImg: true,
            disableSnapshot: true,
          },
        );

        // Only content and attributes are compared against the snapshot.
        const list = content.map((item) => ({
          content: item.content,
          attributes: item.attributes,
        }));

        expect(list).toMatchSnapshot();
      } finally {
        await reset();
      }
    });

    // The element with attribute id `J_resize` must keep the same extracted
    // id across two extractions separated by a wait.
    it('keep same id after resize', async () => {
      const { page, reset } = await launchPage(
        `file://${pagePath}?resize-after-3s=1`,
        {
          viewport: {
            width: 1080,
            height: 2000,
          },
        },
      );

      try {
        const findTargetElement = (items: WebElementInfo[]) =>
          items.find((item) => item.attributes?.id === 'J_resize');

        const { content } = await parseContextFromWebPage(page);
        const item = findTargetElement(content);
        expect(item).toBeDefined();

        // check all the ids are different
        const ids = content.map((element) => element.id);
        const uniqueIds = new Set(ids);
        expect(uniqueIds.size).toBe(ids.length);

        // Presumably the fixture resizes the element 3s after load (per the
        // `resize-after-3s` query flag; fixture code is not visible here), so
        // wait that long plus one second of slack before extracting again.
        await sleep(3000 + 1000);

        const { content: content2 } = await parseContextFromWebPage(page);
        const item2 = findTargetElement(content2);
        expect(item2).toBeDefined();
        expect(item2?.id).toBe(item?.id);
      } finally {
        await reset();
      }
    });

    // With deviceScaleFactor 1 the screenshot has the viewport's dimensions.
    it('check screenshot size - 1x', async () => {
      const { page, reset } = await launchPage(`file://${pagePath}`, {
        viewport: {
          width: 1080,
          height: 2000,
          deviceScaleFactor: 1,
        },
      });

      try {
        const shotBase64 = await page.screenshotBase64();
        const info = await imageInfoOfBase64(shotBase64);
        expect(info.height).toBe(2000);
        expect(info.width).toBe(1080);
      } finally {
        await reset();
      }
    });

    // With deviceScaleFactor 2 the screenshot is twice the viewport size in
    // each dimension.
    it('check screenshot size - 2x', async () => {
      const { page, reset } = await launchPage(`file://${pagePath}`, {
        viewport: {
          width: 1080,
          height: 2000,
          deviceScaleFactor: 2,
        },
      });

      try {
        const shotBase64 = await page.screenshotBase64();
        const info = await imageInfoOfBase64(shotBase64);
        expect(info.width).toBe(2160);
        expect(info.height).toBe(4000);
      } finally {
        await reset();
      }
    });

    // Regenerate extraction artifacts after scrolling a short (200px) viewport.
    // This test has no assertions; it fails only if a step throws.
    it('scroll', async () => {
      const { page, reset } = await launchPage(`file://${pagePath}`, {
        viewport: {
          width: 1080,
          height: 200,
        },
      });

      try {
        await page.scrollDown();
        // Give the page a moment after scrolling before extracting.
        await sleep(1000);
        await generateExtractData(
          page,
          path.join(__dirname, 'fixtures/web-extractor/scroll'),
          {
            disableInputImage: false,
            disableOutputImage: false,
            disableOutputWithoutTextImg: true,
            disableResizeOutputImg: true,
            disableSnapshot: true,
          },
        );
      } finally {
        await reset();
      }
    });

    // Logs how long one extraction takes. NOTE: this loads a live external
    // site, so it needs network access and has no assertions of its own.
    it('profiling', async () => {
      const { page, reset } = await launchPage('https://www.bytedance.com');

      try {
        await sleep(1000);
        console.time('total - parseContextFromWebPage');
        await parseContextFromWebPage(page);
        console.timeEnd('total - parseContextFromWebPage');
      } finally {
        await reset();
      }
    });

    // A StaticPage built from a fixed context must hand back that very same
    // object (`toBe` checks reference identity, not structural equality).
    it('static page with fixed context', async () => {
      const fakeContext = {
        foo: 'bar',
      };
      // Cast: the placeholder object is deliberately not a real context.
      // StaticPage's constructor parameter type is not visible in this file.
      const page = new StaticPage(fakeContext as any);

      const context = await parseContextFromWebPage(page);
      expect(context).toBe(fakeContext);
    });
  },
  {
    // Suite-level timeout of 90s (the tests launch pages and sleep).
    timeout: 90 * 1000,
  },
);
|