From 7cf2e52fe6d460b1836fdaaf6f17f13da2fd5194 Mon Sep 17 00:00:00 2001
From: Gergő Móricz
Date: Wed, 12 Mar 2025 18:46:57 +0100
Subject: [PATCH] feat(crawl): add maxDiscoveryDepth (#1329)

---
 apps/api/src/__tests__/snips/crawl.test.ts | 17 +++++++++++++++++
 apps/api/src/controllers/v1/types.ts       |  6 +++++-
 apps/api/src/lib/crawl-redis.ts            |  3 +++
 apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++++++++
 apps/api/src/services/queue-worker.ts      |  5 +++++
 apps/js-sdk/firecrawl/src/index.ts         |  1 +
 6 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/__tests__/snips/crawl.test.ts b/apps/api/src/__tests__/snips/crawl.test.ts
index f388243d1..599e34cfc 100644
--- a/apps/api/src/__tests__/snips/crawl.test.ts
+++ b/apps/api/src/__tests__/snips/crawl.test.ts
@@ -53,4 +53,21 @@ describe("Crawl tests", () => {
       }
     }
   }, 120000);
+
+  it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      ignoreSitemap: true,
+      maxDiscoveryDepth: 1,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
+      }
+    }
+  }, 120000);
 });
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 4f946f03a..5a71da9c6 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -440,6 +440,7 @@ const crawlerOptions = z
     includePaths: z.string().array().default([]),
     excludePaths: z.string().array().default([]),
     maxDepth: z.number().default(10), // default?
+    maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
     allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
     allowExternalLinks: z.boolean().default(false),
@@ -793,6 +794,8 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
+    currentDiscoveryDepth: 0,
   };
 }
 
@@ -814,7 +817,8 @@ export function fromLegacyCrawlerOptions(x: any): {
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
       regexOnFullURL: x.regexOnFullURL,
-    }),
+      maxDiscoveryDepth: x.maxDiscoveryDepth,
+    }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
     },
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 256d74352..b31605c70 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -379,6 +379,7 @@ export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
   newBase?: string,
+  crawlerOptions?: any,
 ): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
@@ -399,6 +400,8 @@ export function crawlToCrawler(
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
+    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
+    currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
   });
 
   if (sc.robots !== undefined) {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index ea93110ad..5f26f817c 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -31,6 +31,8 @@ export class WebCrawler {
   private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
+  private maxDiscoveryDepth: number | undefined;
+  private currentDiscoveryDepth: number;
 
   constructor({
     jobId,
@@ -47,6 +49,8 @@
     allowSubdomains = false,
     ignoreRobotsTxt = false,
     regexOnFullURL = false,
+    maxDiscoveryDepth,
+    currentDiscoveryDepth,
   }: {
     jobId: string;
     initialUrl: string;
@@ -62,6 +66,8 @@
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
     regexOnFullURL?: boolean;
+    maxDiscoveryDepth?: number;
+    currentDiscoveryDepth?: number;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -81,6 +87,8 @@
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
     this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
+    this.maxDiscoveryDepth = maxDiscoveryDepth;
+    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
   }
 
   public filterLinks(
@@ -89,6 +97,11 @@
     maxDepth: number,
     fromMap: boolean = false,
   ): string[] {
+    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
+      this.logger.debug("Max discovery depth hit, filtering off all links", { currentDiscoveryDepth: this.currentDiscoveryDepth, maxDiscoveryDepth: this.maxDiscoveryDepth });
+      return [];
+    }
+
     // If the initial URL is a sitemap.xml, skip filtering
     if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
       return sitemapLinks.slice(0, limit);
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 6287cf95e..5493da734 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -1044,6 +1044,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         job.data.crawl_id,
         sc,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
+        job.data.crawlerOptions,
       );
 
       const links = crawler.filterLinks(
@@ -1088,6 +1089,10 @@
               team_id: sc.team_id,
               scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
               internalOptions: sc.internalOptions,
+              crawlerOptions: {
+                ...sc.crawlerOptions,
+                currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
+              },
               plan: job.data.plan,
               origin: job.data.origin,
               crawl_id: job.data.crawl_id,
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 99e008d3f..ab09432e5 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -160,6 +160,7 @@ export interface CrawlParams {
   includePaths?: string[];
   excludePaths?: string[];
   maxDepth?: number;
+  maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
   allowExternalLinks?: boolean;
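
Usage sketch. Per the diff, the initial crawl job starts at currentDiscoveryDepth 0, the queue worker enqueues child jobs with the depth incremented by one, and WebCrawler.filterLinks stops returning links once the current depth equals maxDiscoveryDepth, so maxDiscoveryDepth: 1 crawls the entry page plus the pages it links to and goes no further. The snippet below is a minimal example assuming the JS SDK's existing FirecrawlApp.crawlUrl helper from @mendable/firecrawl-js; the API key is a placeholder.

import FirecrawlApp from "@mendable/firecrawl-js";

async function main() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

  // Only links discovered on the entry page are followed; links found on
  // those child pages are filtered out once the discovery depth is exhausted.
  const res = await app.crawlUrl("https://firecrawl.dev", {
    ignoreSitemap: true,
    maxDiscoveryDepth: 1,
    limit: 10,
  });

  if (res.success) {
    for (const page of res.data) {
      console.log(page.metadata?.sourceURL);
    }
  }
}

main();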