import { Request, Response } from "express";
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { addScrapeJob } from "../../src/services/queue-jobs";
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../src/services/idempotency/create";
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";

export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status } = await authenticateUser(
      req,
      res,
      RateLimiterMode.Crawl
    );
    if (!success) {
      return res.status(status).json({ error });
    }

    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }

      try {
        // Record the key so a retry of the same request is rejected above.
        await createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }
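
    // Client-side sketch (illustrative only; the route path is an assumption,
    // not taken from this file). Any sufficiently unique string works as the
    // idempotency key, e.g. a UUID:
    //
    //   fetch("/v0/crawl", {
    //     method: "POST",
    //     headers: {
    //       "Content-Type": "application/json",
    //       "x-idempotency-key": crypto.randomUUID(),
    //     },
    //     body: JSON.stringify({ url: "https://example.com" }),
    //   });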

    const { success: creditsCheckSuccess, message: creditsCheckMessage } =
      await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return res.status(402).json({ error: "Insufficient credits" });
    }

    const url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }

    if (isUrlBlocked(url)) {
      return res.status(403).json({
        error:
          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      });
    }

    const mode = req.body.mode ?? "crawl";

    // Merge request-supplied options over the defaults; request values win.
    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
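
    // Example request body (illustrative; the option keys shown are
    // assumptions about the defaults merged above, not a schema guarantee):
    //
    //   {
    //     "url": "https://example.com",
    //     "crawlerOptions": { "limit": 100 },
    //     "pageOptions": { "onlyMainContent": true }
    //   }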

    // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
    //   try {
    //     const a = new WebScraperDataProvider();
    //     await a.setOptions({
    //       jobId: uuidv4(),
    //       mode: "single_urls",
    //       urls: [url],
    //       crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
    //       pageOptions: pageOptions,
    //     });
    //     const docs = await a.getDocuments(false, (progress) => {
    //       job.updateProgress({
    //         current: progress.current,
    //         total: progress.total,
    //         current_step: "SCRAPING",
    //         current_url: progress.currentDocumentUrl,
    //       });
    //     });
    //     return res.json({
    //       success: true,
    //       documents: docs,
    //     });
    //   } catch (error) {
    //     Logger.error(error);
    //     return res.status(500).json({ error: error.message });
    //   }
    // }

    const id = uuidv4();

    await logCrawl(id, team_id);

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      pageOptions,
      team_id,
    };

    const crawler = crawlToCrawler(id, sc);

    // Fetch robots.txt once up front so it is persisted with the crawl; a
    // failure here is non-fatal and the crawl proceeds without robots rules.
    try {
      sc.robots = await crawler.getRobotsTxt();
    } catch (_) {}

    await saveCrawl(id, sc);

    const sitemap = sc.crawlerOptions?.ignoreSitemap
      ? null
      : await crawler.tryGetSitemap();

    if (sitemap !== null) {
      // Sitemap available: lock each URL and enqueue one scrape job per entry.
      for (const url of sitemap.map((x) => x.url)) {
        await lockURL(id, sc, url);
        const job = await addScrapeJob({
          url,
          mode: "single_urls",
          crawlerOptions: crawlerOptions,
          team_id: team_id,
          pageOptions: pageOptions,
          origin: req.body.origin ?? defaultOrigin,
          crawl_id: id,
          sitemapped: true,
        });
        await addCrawlJob(id, job.id);
      }
    } else {
      // No sitemap: seed the crawl with a single job for the origin URL.
      await lockURL(id, sc, url);
      const job = await addScrapeJob({
        url,
        mode: "single_urls",
        crawlerOptions: crawlerOptions,
        team_id: team_id,
        pageOptions: pageOptions,
        origin: req.body.origin ?? defaultOrigin,
        crawl_id: id,
      });
      await addCrawlJob(id, job.id);
    }

    res.json({ jobId: id });
  } catch (error) {
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}
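
// Usage sketch (illustrative; the route path and import path are assumptions,
// not taken from this file):
//
//   import express from "express";
//   import { crawlController } from "./controllers/crawl";
//
//   const app = express();
//   app.use(express.json());
//   app.post("/v0/crawl", crawlController);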