// Mirror of https://github.com/mendableai/firecrawl.git
// Synced 2025-09-25 16:29:43 +00:00
// 180 lines, 6.8 KiB, TypeScript
import { WebScraperDataProvider } from "../index";
|
|
|
|
describe("WebScraperDataProvider", () => {
|
|
describe("replaceImgPathsWithAbsolutePaths", () => {
|
|
it("should replace image paths with absolute paths", () => {
  // NOTE(review): every fixture below has an empty `content`, so the input
  // and the expectation are identical and no image path is rewritten here.
  // The image markdown may have been lost from these strings — confirm
  // against the upstream test before relying on this case.
  const pageUrl = "https://example.com/page";
  const anotherPageUrl = "https://example.com/another-page";
  const dataImageUrl = "https://example.com/data-image";

  // Documents handed to the provider.
  const inputDocs = [
    { metadata: { sourceURL: pageUrl }, content: "" },
    { metadata: { sourceURL: anotherPageUrl }, content: "" },
    { metadata: { sourceURL: anotherPageUrl }, content: "" },
    { metadata: { sourceURL: dataImageUrl }, content: "" },
  ];

  // Documents the provider is expected to return, in the same order.
  const expectedDocs = [
    { metadata: { sourceURL: pageUrl }, content: "" },
    { metadata: { sourceURL: anotherPageUrl }, content: "" },
    { metadata: { sourceURL: anotherPageUrl }, content: "" },
    { metadata: { sourceURL: dataImageUrl }, content: "" },
  ];

  const provider = new WebScraperDataProvider();
  expect(provider.replaceImgPathsWithAbsolutePaths(inputDocs)).toEqual(
    expectedDocs,
  );
});
|
|
|
|
it("should handle absolute URLs without modification", () => {
  // NOTE(review): both fixtures carry an empty `content`, so no absolute
  // URL is present in the input. The original markdown may be missing from
  // these strings — confirm against the upstream test.
  const pageUrl = "https://example.com/page";
  const anotherPageUrl = "https://example.com/another-page";

  // Documents handed to the provider.
  const inputDocs = [
    { metadata: { sourceURL: pageUrl }, content: "" },
    { metadata: { sourceURL: anotherPageUrl }, content: "" },
  ];

  // The output must equal the input field-for-field.
  const expectedDocs = [
    { metadata: { sourceURL: pageUrl }, content: "" },
    { metadata: { sourceURL: anotherPageUrl }, content: "" },
  ];

  const provider = new WebScraperDataProvider();
  expect(provider.replaceImgPathsWithAbsolutePaths(inputDocs)).toEqual(
    expectedDocs,
  );
});
|
|
|
|
it("should not replace non-image content within the documents", () => {
  const pageUrl = "https://example.com/page";
  const anotherPageUrl = "https://example.com/another-page";

  // Content holding a markdown link and bold text but no image syntax.
  const inputDocs = [
    {
      metadata: { sourceURL: pageUrl },
      content:
        "This is a test.  Here is a link: [Example](https://example.com).",
    },
    {
      metadata: { sourceURL: anotherPageUrl },
      content: "Another test.  Here is some **bold text**.",
    },
  ];

  // Expected output is identical to the input: nothing should be rewritten.
  const expectedDocs = [
    {
      metadata: { sourceURL: pageUrl },
      content:
        "This is a test.  Here is a link: [Example](https://example.com).",
    },
    {
      metadata: { sourceURL: anotherPageUrl },
      content: "Another test.  Here is some **bold text**.",
    },
  ];

  const provider = new WebScraperDataProvider();
  expect(provider.replaceImgPathsWithAbsolutePaths(inputDocs)).toEqual(
    expectedDocs,
  );
});
|
|
it("should replace multiple image paths within the documents", () => {
  // NOTE(review): the strings below contain no image markdown, and the
  // expectation equals the input, so no replacement is exercised as written.
  // The image references may have been lost from these fixtures — confirm
  // against the upstream test.
  const pageUrl = "https://example.com/page";
  const anotherPageUrl = "https://example.com/another-page";

  // Documents handed to the provider (note the trailing space in each string).
  const inputDocs = [
    {
      metadata: { sourceURL: pageUrl },
      content:
        "This is a test.  Here is a link: [Example](https://example.com). ",
    },
    {
      metadata: { sourceURL: anotherPageUrl },
      content: "Another test.  Here is some **bold text**. ",
    },
  ];

  // Documents the provider is expected to return.
  const expectedDocs = [
    {
      metadata: { sourceURL: pageUrl },
      content:
        "This is a test.  Here is a link: [Example](https://example.com). ",
    },
    {
      metadata: { sourceURL: anotherPageUrl },
      content: "Another test.  Here is some **bold text**. ",
    },
  ];

  const provider = new WebScraperDataProvider();
  expect(provider.replaceImgPathsWithAbsolutePaths(inputDocs)).toEqual(
    expectedDocs,
  );
});
|
|
|
|
it("should replace image paths within the documents with complex URLs", () => {
  // NOTE(review): the strings below contain no image markdown, and the
  // expectation equals the input, so no replacement is exercised as written.
  // The image references may have been lost from these fixtures — confirm
  // against the upstream test.
  const pageSubpageUrl = "https://example.com/page/subpage";
  const anotherSubpageUrl = "https://example.com/another-page/subpage";

  // Documents whose source URLs include a nested path segment.
  const inputDocs = [
    {
      metadata: { sourceURL: pageSubpageUrl },
      content:
        "This is a test.  Here is a link: [Example](https://example.com). ",
    },
    {
      metadata: { sourceURL: anotherSubpageUrl },
      content: "Another test.  Here is some **bold text**. ",
    },
  ];

  // Documents the provider is expected to return.
  const expectedDocs = [
    {
      metadata: { sourceURL: pageSubpageUrl },
      content:
        "This is a test.  Here is a link: [Example](https://example.com). ",
    },
    {
      metadata: { sourceURL: anotherSubpageUrl },
      content: "Another test.  Here is some **bold text**. ",
    },
  ];

  const provider = new WebScraperDataProvider();
  expect(provider.replaceImgPathsWithAbsolutePaths(inputDocs)).toEqual(
    expectedDocs,
  );
});
|
|
});
|
|
});
|