import { Request, Response } from "express";
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../../src/types";
import { addScrapeJob } from "../../../src/services/queue-jobs";
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
import { logCrawl } from "../../../src/services/logging/crawl_log";
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
import {
  defaultCrawlPageOptions,
  defaultCrawlerOptions,
  defaultOrigin,
} from "../../../src/lib/default-values";
import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../../src/lib/logger";
import {
  addCrawlJob,
  addCrawlJobs,
  crawlToCrawler,
  lockURL,
  lockURLs,
  saveCrawl,
  StoredCrawl,
} from "../../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
import * as Sentry from "@sentry/node";
import { getJobPriority } from "../../lib/job-priority";
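
/**
 * v0 crawl controller.
 *
 * Authenticates the caller, validates crawler options and team credits,
 * persists the crawl state, and enqueues scrape jobs: one per sitemap URL
 * when a sitemap is available, otherwise a single job for the origin URL.
 */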
export async function crawlController(req: Request, res: Response) {
  try {
    const { success, team_id, error, status, plan, chunk } =
      await authenticateUser(req, res, RateLimiterMode.Crawl);
    if (!success) {
      return res.status(status).json({ error });
    }

    // An x-idempotency-key header lets clients retry safely: a key that has
    // already been used is rejected instead of starting a duplicate crawl.
    if (req.headers["x-idempotency-key"]) {
      const isIdempotencyValid = await validateIdempotencyKey(req);
      if (!isIdempotencyValid) {
        return res.status(409).json({ error: "Idempotency key already used" });
      }
      try {
        // await so a failed key creation is caught by this try/catch
        await createIdempotencyKey(req);
      } catch (error) {
        Logger.error(error);
        return res.status(500).json({ error: error.message });
      }
    }

    const crawlerOptions = {
      ...defaultCrawlerOptions,
      ...req.body.crawlerOptions,
    };
    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
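
    // Validate include/exclude patterns up front so a malformed regex fails
    // the request with a 400 instead of failing mid-crawl.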
    if (Array.isArray(crawlerOptions.includes)) {
      for (const x of crawlerOptions.includes) {
        try {
          new RegExp(x);
        } catch (e) {
          return res.status(400).json({ error: e.message });
        }
      }
    }
    if (Array.isArray(crawlerOptions.excludes)) {
      for (const x of crawlerOptions.excludes) {
        try {
          new RegExp(x);
        } catch (e) {
          return res.status(400).json({ error: e.message });
        }
      }
    }

    const limitCheck = req.body?.crawlerOptions?.limit ?? 1;
    const {
      success: creditsCheckSuccess,
      message: creditsCheckMessage,
      remainingCredits,
    } = await checkTeamCredits(chunk, team_id, limitCheck);

    if (!creditsCheckSuccess) {
      return res.status(402).json({
        error:
          "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com",
      });
    }

    // TODO: need to do this to v1
    // Never crawl more pages than the team has credits remaining.
    crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);

    let url = req.body.url;
    if (!url) {
      return res.status(400).json({ error: "Url is required" });
    }
    if (typeof url !== "string") {
      return res.status(400).json({ error: "URL must be a string" });
    }
    try {
      url = checkAndUpdateURL(url).url;
    } catch (e) {
      return res
        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
        .json({ error: e.message ?? e });
    }
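
    // Reject URLs on the scraper blocklist.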
    if (isUrlBlocked(url)) {
      return res.status(403).json({
        error:
          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
      });
    }

// if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
// try {
// const a = new WebScraperDataProvider();
// await a.setOptions({
// jobId: uuidv4(),
// mode: "single_urls",
// urls: [url],
// crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true },
// pageOptions: pageOptions,
// });
// const docs = await a.getDocuments(false, (progress) => {
// job.updateProgress({
// current: progress.current,
// total: progress.total,
// current_step: "SCRAPING",
// current_url: progress.currentDocumentUrl,
// });
// });
// return res.json({
// success: true,
// documents: docs,
// });
// } catch (error) {
// Logger.error(error);
// return res.status(500).json({ error: error.message });
// }
// }
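    // Create the crawl: generate an id, log it, and persist the crawl state.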
    const id = uuidv4();
    await logCrawl(id, team_id);

    const sc: StoredCrawl = {
      originUrl: url,
      crawlerOptions,
      pageOptions,
      team_id,
      plan,
      createdAt: Date.now(),
    };

    const crawler = crawlToCrawler(id, sc);

    try {
      sc.robots = await crawler.getRobotsTxt();
    } catch (_) {
      // robots.txt is best-effort; the crawl proceeds without it
    }

    await saveCrawl(id, sc);

    const sitemap = sc.crawlerOptions?.ignoreSitemap
      ? null
      : await crawler.tryGetSitemap();
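
    // A sitemap was found: fan out one scrape job per sitemap URL.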
    if (sitemap !== null && sitemap.length > 0) {
      let jobPriority = 20;
      // If it is over 1000, we need to get the job priority,
      // otherwise we can use the default priority of 20
      if (sitemap.length > 1000) {
        // set base to 21
        jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
      }
      const jobs = sitemap.map((x) => {
        const url = x.url;
        const uuid = uuidv4();
        return {
          name: uuid,
          data: {
            url,
            mode: "single_urls",
            crawlerOptions: crawlerOptions,
            team_id,
            plan,
            pageOptions: pageOptions,
            origin: req.body.origin ?? defaultOrigin,
            crawl_id: id,
            sitemapped: true,
          },
          opts: {
            jobId: uuid,
            priority: jobPriority,
          },
        };
      });

      // Lock all sitemap URLs and register the job ids on the crawl
      // before any of the jobs are enqueued.
      await lockURLs(
        id,
        jobs.map((x) => x.data.url)
      );
      await addCrawlJobs(
        id,
        jobs.map((x) => x.opts.jobId)
      );

      if (Sentry.isInitialized()) {
        for (const job of jobs) {
          // add with sentry instrumentation
          await addScrapeJob(job.data as any, {}, job.opts.jobId);
        }
      } else {
        await getScrapeQueue().addBulk(jobs);
      }
    } else {
      // No sitemap: seed the crawl with the origin URL only.
      await lockURL(id, sc, url);

      // Not needed, first one should be 15.
      // const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})

      const job = await addScrapeJob(
        {
          url,
          mode: "single_urls",
          crawlerOptions: crawlerOptions,
          team_id,
          plan,
          pageOptions: pageOptions,
          origin: req.body.origin ?? defaultOrigin,
          crawl_id: id,
        },
        {
          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
        }
      );
      await addCrawlJob(id, job.id);
    }
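
    // v0 responds with the crawl id as jobId; clients poll the crawl status
    // endpoint with this id to track progress.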
    res.json({ jobId: id });
  } catch (error) {
    Sentry.captureException(error);
    Logger.error(error);
    return res.status(500).json({ error: error.message });
  }
}