mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-09-26 17:01:27 +00:00
97 lines
3.6 KiB
TypeScript
97 lines
3.6 KiB
TypeScript
![]() |
import { WebScraperDataProvider } from '../index';
|
||
|
|
||
|
/**
 * Unit tests for `WebScraperDataProvider.replaceImgPathsWithAbsolutePaths`.
 *
 * Each case builds a list of documents (`metadata.sourceURL` + `content`),
 * passes it through the method under test, and deep-compares the returned
 * value against an explicitly written expected list.
 */
describe('WebScraperDataProvider', () => {
  describe('replaceImgPathsWithAbsolutePaths', () => {
    it('should replace image paths with absolute paths', () => {
      const webScraperDataProvider = new WebScraperDataProvider();

      // NOTE(review): every `content` fixture in this case is an empty string,
      // so as written it only checks that empty content is returned unchanged.
      // The case name suggests the fixtures originally held markdown image
      // references with relative paths — confirm against upstream and restore.
      const documents = [
        {
          metadata: { sourceURL: 'https://example.com/page' },
          content: '',
        },
        {
          metadata: { sourceURL: 'https://example.com/another-page' },
          content: '',
        },
        {
          metadata: { sourceURL: 'https://example.com/data-image' },
          content: '',
        },
      ];

      const expectedDocuments = [
        {
          metadata: { sourceURL: 'https://example.com/page' },
          content: '',
        },
        {
          metadata: { sourceURL: 'https://example.com/another-page' },
          content: '',
        },
        {
          metadata: { sourceURL: 'https://example.com/data-image' },
          content: '',
        },
      ];

      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should handle absolute URLs without modification', () => {
      const webScraperDataProvider = new WebScraperDataProvider();

      // NOTE(review): the `content` fixtures are empty strings here as well, so
      // no absolute image URL is actually exercised — presumably the original
      // fixtures contained image references; verify against upstream.
      const documents = [
        {
          metadata: { sourceURL: 'https://example.com/page' },
          content: '',
        },
        {
          metadata: { sourceURL: 'https://example.com/another-page' },
          content: '',
        },
      ];

      const expectedDocuments = [
        {
          metadata: { sourceURL: 'https://example.com/page' },
          content: '',
        },
        {
          metadata: { sourceURL: 'https://example.com/another-page' },
          content: '',
        },
      ];

      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });

    it('should not replace non-image content within the documents', () => {
      const webScraperDataProvider = new WebScraperDataProvider();

      // Input and expected content are identical: a markdown link and bold
      // text must come back untouched.
      // NOTE(review): the double space after the first sentence in each string
      // looks like a gap where an image reference may have been dropped —
      // confirm against upstream before relying on this case.
      const documents = [
        {
          metadata: { sourceURL: 'https://example.com/page' },
          content: 'This is a test.  Here is a link: [Example](https://example.com).',
        },
        {
          metadata: { sourceURL: 'https://example.com/another-page' },
          content: 'Another test.  Here is some **bold text**.',
        },
      ];

      const expectedDocuments = [
        {
          metadata: { sourceURL: 'https://example.com/page' },
          content: 'This is a test.  Here is a link: [Example](https://example.com).',
        },
        {
          metadata: { sourceURL: 'https://example.com/another-page' },
          content: 'Another test.  Here is some **bold text**.',
        },
      ];

      const result = webScraperDataProvider.replaceImgPathsWithAbsolutePaths(documents);
      expect(result).toEqual(expectedDocuments);
    });
  });
});
|