// firecrawl/apps/api/src/controllers/v0/crawlPreview.ts
import { Request, Response } from "express";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
export async function crawlPreviewController(req: Request, res: Response) {
try {
2024-08-21 22:20:40 -03:00
const { success, error, status, team_id:a, plan } = await authenticateUser(
2024-04-20 16:38:05 -07:00
req,
res,
RateLimiterMode.Preview
);
2024-08-13 21:03:24 +02:00
const team_id = "preview";
2024-04-20 16:38:05 -07:00
if (!success) {
return res.status(status).json({ error });
}
2024-08-13 21:03:24 +02:00
let url = req.body.url;
2024-04-20 16:38:05 -07:00
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
try {
url = checkAndUpdateURL(url).url;
} catch (e) {
return res
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
.json({ error: e.message ?? e });
}
if (isUrlBlocked(url)) {
2024-08-13 21:03:24 +02:00
return res
.status(403)
.json({
error:
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
});
}
2024-04-20 16:38:05 -07:00
const crawlerOptions = req.body.crawlerOptions ?? {};
2024-08-22 15:15:45 -03:00
const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
2024-04-20 19:37:45 -07:00
2024-08-13 21:03:24 +02:00
// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.updateProgress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
const id = uuidv4();
let robots;
try {
robots = await this.getRobotsTxt();
} catch (_) {}
const sc: StoredCrawl = {
originUrl: url,
crawlerOptions,
pageOptions,
team_id,
2024-08-21 22:20:40 -03:00
plan,
2024-08-13 21:03:24 +02:00
robots,
2024-08-15 19:02:05 +02:00
createdAt: Date.now(),
2024-08-13 21:03:24 +02:00
};
await saveCrawl(id, sc);
const crawler = crawlToCrawler(id, sc);
const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
if (sitemap !== null) {
for (const url of sitemap.map(x => x.url)) {
await lockURL(id, sc, url);
const job = await addScrapeJob({
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id,
plan,
2024-08-13 21:03:24 +02:00
pageOptions: pageOptions,
origin: "website-preview",
crawl_id: id,
sitemapped: true,
});
await addCrawlJob(id, job.id);
}
} else {
await lockURL(id, sc, url);
const job = await addScrapeJob({
url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
team_id,
plan,
2024-08-13 21:03:24 +02:00
pageOptions: pageOptions,
origin: "website-preview",
crawl_id: id,
});
await addCrawlJob(id, job.id);
}
2024-04-20 16:38:05 -07:00
2024-08-13 21:03:24 +02:00
res.json({ jobId: id });
2024-04-20 16:38:05 -07:00
} catch (error) {
Sentry.captureException(error);
2024-07-25 09:48:06 -03:00
Logger.error(error);
2024-04-20 16:38:05 -07:00
return res.status(500).json({ error: error.message });
}
}