import { Request, Response } from "express";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import {
  defaultCrawlPageOptions,
  defaultCrawlerOptions,
  defaultOrigin,
} from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import {
  addCrawlJob,
  addCrawlJobs,
  crawlToCrawler,
  lockURL,
  lockURLs,
  saveCrawl,
  StoredCrawl,
} from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
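
/**
 * v0 crawl controller.
 *
 * Authenticates the caller, validates crawler options and team credits,
 * persists the crawl state, and enqueues scrape jobs: one per sitemap URL
 * when a sitemap is available, otherwise a single job for the origin URL.
 */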
export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status, plan, chunk } =
      await authenticateUser(req, res, RateLimiterMode.Crawl);
    if (!success) {
      return res.status(status).json({ error });
    }

    // An x-idempotency-key header lets clients retry safely: a key that has
    // already been used is rejected instead of starting a duplicate crawl.
    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        // await so a failed key creation is caught by this try/catch
        await createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const crawlerOptions = {
      ...defaultCrawlerOptions,
      ...req.body.crawlerOptions,
    };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
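
    // Validate include/exclude patterns up front so a malformed regex fails
    // the request with a 400 instead of failing mid-crawl.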
    if (Array.isArray(crawlerOptions.includes)) {
      for (const x of crawlerOptions.includes) {
        try {
          new RegExp(x);
        } catch (e) {
          return res.status(400).json({ error: e.message });
        }
      }
    }
    if (Array.isArray(crawlerOptions.excludes)) {
      for (const x of crawlerOptions.excludes) {
        try {
          new RegExp(x);
        } catch (e) {
          return res.status(400).json({ error: e.message });
        }
      }
    }

    const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
    const {
      success: creditsCheckSuccess,
      message: creditsCheckMessage,
      remainingCredits,
    } = await checkTeamCredits(chunk, team_id, limitCheck);

    if (!creditsCheckSuccess) {
      return res.status(402).json({
        error:
          "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com",
      });
    }

    // TODO: need to do this to v1
    // Never crawl more pages than the team has credits remaining.
    crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
    if (typeof url !== "string") {
      return res.status(400).json({ error: "URL must be a string" });
    }
    try {
      url = checkAndUpdateURL(url).url;
    } catch (e) {
      return res
        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
        .json({ error: e.message ?? e });
    }
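
    // Reject URLs on the scraper blocklist.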
    if (isUrlBlocked(url)) {
      return res.status(403).json({
        error:
          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      });
    }

// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.updateProgress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
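    // Create the crawl: generate an id, log it, and persist the crawl state.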
    const id = uuidv4();
    await logCrawl(id, team_id);

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      pageOptions,
      team_id,
      plan,
      createdAt: Date.now(),
    };

    const crawler = crawlToCrawler(id, sc);

    try {
      sc.robots = await crawler.getRobotsTxt();
    } catch (_) {
      // robots.txt is best-effort; the crawl proceeds without it
    }

    await saveCrawl(id, sc);

    const sitemap = sc.crawlerOptions?.ignoreSitemap
      ? null
      : await crawler.tryGetSitemap();
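
    // A sitemap was found: fan out one scrape job per sitemap URL.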
    if (sitemap !== null && sitemap.length > 0) {
      let jobPriority = 20;
      // If it is over 1000, we need to get the job priority,
      // otherwise we can use the default priority of 20
      if (sitemap.length > 1000) {
        // set base to 21
        jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
      }
      const jobs = sitemap.map((x) => {
        const url = x.url;
        const uuid = uuidv4();
        return {
          name: uuid,
          data: {
            url,
            mode: "single_urls",
            crawlerOptions: crawlerOptions,
            team_id,
            plan,
            pageOptions: pageOptions,
            origin: req.body.origin ?? defaultOrigin,
            crawl_id: id,
            sitemapped: true,
          },
          opts: {
            jobId: uuid,
            priority: jobPriority,
          },
        };
      });

      // Lock all sitemap URLs and register the job ids on the crawl
      // before any of the jobs are enqueued.
      await lockURLs(
        id,
        jobs.map((x) => x.data.url)
      );
      await addCrawlJobs(
        id,
        jobs.map((x) => x.opts.jobId)
      );

      if (Sentry.isInitialized()) {
        for (const job of jobs) {
          // add with sentry instrumentation
          await addScrapeJob(job.data as any, {}, job.opts.jobId);
        }
      } else {
        await getScrapeQueue().addBulk(jobs);
      }
    } else {
      // No sitemap: seed the crawl with the origin URL only.
      await lockURL(id, sc, url);

      // Not needed, first one should be 15.
      // const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})

      const job = await addScrapeJob(
        {
          url,
          mode: "single_urls",
          crawlerOptions: crawlerOptions,
          team_id,
          plan,
          pageOptions: pageOptions,
          origin: req.body.origin ?? defaultOrigin,
          crawl_id: id,
        },
        {
          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
        }
      );
      await addCrawlJob(id, job.id);
    }
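
    // v0 responds with the crawl id as jobId; clients poll the crawl status
    // endpoint with this id to track progress.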
    res.json({ jobId: id });
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}