From 09aabbedb58b5927b85ebf773a54664153156a04 Mon Sep 17 00:00:00 2001
From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 20 Jun 2025 12:02:23 -0300
Subject: [PATCH] feat: add followInternalLinks parameter as semantic
 replacement for allowBackwardLinks (#1684)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: add followInternalLinks parameter as semantic replacement for allowBackwardLinks

- Add followInternalLinks parameter to crawl API with same functionality as allowBackwardLinks
- Update transformation logic to use followInternalLinks with precedence over allowBackwardLinks
- Add parameter to Python SDK crawl methods with proper precedence handling
- Add parameter to Node.js SDK CrawlParams interface
- Add comprehensive tests for new parameter and backward compatibility
- Maintain full backward compatibility for existing allowBackwardLinks usage
- Add deprecation notices in documentation while preserving functionality

Co-Authored-By: Nick

* fix: revert accidental cache=True changes to preserve original cache parameter handling

- Revert cache=True back to cache=cache in generate_llms_text methods
- Preserve original parameter passing behavior for cache parameter
- Fix accidental hardcoding of cache parameter to True

Co-Authored-By: Nick

* refactor: rename followInternalLinks to crawlEntireDomain across API, SDKs, and tests

- Rename followInternalLinks parameter to crawlEntireDomain in API schema
- Update Node.js SDK CrawlParams interface to use crawlEntireDomain
- Update Python SDK methods to use crawl_entire_domain parameter
- Update test cases to use new crawlEntireDomain parameter name
- Maintain backward compatibility with allowBackwardLinks
- Update transformation logic to use crawlEntireDomain with precedence

Co-Authored-By: Nick

* fix: add missing cache parameter to generate_llms_text and update documentation references

Co-Authored-By: Nick

* Update apps/python-sdk/firecrawl/firecrawl.py

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nick
Co-authored-by: Gergő Móricz
---
 apps/api/src/__tests__/snips/crawl.test.ts | 40 ++++++++++++++++++++++
 apps/api/src/controllers/v1/types.ts       | 23 +++++++++----
 apps/js-sdk/firecrawl/src/index.ts         |  1 +
 apps/python-sdk/firecrawl/firecrawl.py     | 39 +++++++++++++++------
 4 files changed, 86 insertions(+), 17 deletions(-)

diff --git a/apps/api/src/__tests__/snips/crawl.test.ts b/apps/api/src/__tests__/snips/crawl.test.ts
index 3e77b2cc7..8903458a3 100644
--- a/apps/api/src/__tests__/snips/crawl.test.ts
+++ b/apps/api/src/__tests__/snips/crawl.test.ts
@@ -100,4 +100,44 @@ describe("Crawl tests", () => {
 //       }
 //     }
 //   }, 300000);
+
+  it.concurrent("crawlEntireDomain parameter works", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      crawlEntireDomain: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
+
+  it.concurrent("crawlEntireDomain takes precedence over allowBackwardLinks", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      allowBackwardLinks: false,
+      crawlEntireDomain: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
+
+  it.concurrent("backward compatibility - allowBackwardLinks still works", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      allowBackwardLinks: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
 });
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 42e40d051..ca35ab5bc 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -615,7 +615,8 @@ const crawlerOptions = z
     maxDepth: z.number().default(10), // default?
     maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
-    allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
+    allowBackwardLinks: z.boolean().default(false), // DEPRECATED: use crawlEntireDomain
+    crawlEntireDomain: z.boolean().optional(),
     allowExternalLinks: z.boolean().default(false),
     allowSubdomains: z.boolean().default(false),
     ignoreRobotsTxt: z.boolean().default(false),
@@ -632,7 +633,8 @@ const crawlerOptions = z
 //   excludePaths?: string[];
 //   maxDepth?: number;
 //   limit?: number;
-//   allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
+//   allowBackwardLinks?: boolean; // DEPRECATED: use crawlEntireDomain
+//   crawlEntireDomain?: boolean;
 //   allowExternalLinks?: boolean;
 //   ignoreSitemap?: boolean;
 // };
@@ -652,10 +654,15 @@ export const crawlRequestSchema = crawlerOptions
   .strict(strictMessage)
   .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
   .refine((x) => fire1Refine(x.scrapeOptions), fire1RefineOpts)
-  .transform((x) => ({
-    ...x,
-    scrapeOptions: extractTransform(x.scrapeOptions),
-  }));
+  .transform((x) => {
+    if (x.crawlEntireDomain !== undefined) {
+      x.allowBackwardLinks = x.crawlEntireDomain;
+    }
+    return {
+      ...x,
+      scrapeOptions: extractTransform(x.scrapeOptions),
+    };
+  });
 
 // export type CrawlRequest = {
 //   url: string;
@@ -1041,7 +1048,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     maxDepth: x.maxDepth,
     limit: x.limit,
     generateImgAltText: false,
-    allowBackwardCrawling: x.allowBackwardLinks,
+    allowBackwardCrawling: x.crawlEntireDomain ?? x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
@@ -1062,6 +1069,7 @@ export function toNewCrawlerOptions(x: any): CrawlerOptions {
     limit: x.limit,
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
+    crawlEntireDomain: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
@@ -1085,6 +1093,7 @@ export function fromLegacyCrawlerOptions(x: any, teamId: string): {
     limit: x.maxCrawledLinks ?? x.limit,
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
+    crawlEntireDomain: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 05939ae01..5ec70baaf 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -209,6 +209,7 @@ export interface CrawlParams {
   maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
+  crawlEntireDomain?: boolean;
   allowExternalLinks?: boolean;
   ignoreSitemap?: boolean;
   scrapeOptions?: CrawlScrapeOptions;
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index bc1d0479c..b425cff53 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -687,6 +687,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -710,7 +711,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -749,7 +751,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -802,6 +806,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -823,7 +828,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -862,7 +868,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -1051,6 +1059,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -1073,7 +1082,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -2811,7 +2821,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             * limit - Maximum pages to crawl
 
         Link Following:
-            * allowBackwardLinks - Follow parent directory links
+            * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
+            * crawlEntireDomain - Follow parent directory links
             * allowExternalLinks - Follow external domain links
             * ignoreSitemap - Skip sitemap.xml processing
 
@@ -3290,6 +3301,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3312,7 +3324,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3350,7 +3363,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -3402,6 +3417,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3424,7 +3440,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3458,7 +3475,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
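
The precedence rule introduced above lives in the crawlRequestSchema transform: when crawlEntireDomain is present it overwrites allowBackwardLinks before the options are mapped to the legacy crawler settings, and the legacy flag keeps working when used on its own. A stripped-down TypeScript sketch of that behavior, keeping only the two flags involved (an illustration, not code taken from the patch):

import { z } from "zod";

// Reduced model of the crawl options schema: just the two flags involved.
const crawlFlags = z
  .object({
    allowBackwardLinks: z.boolean().default(false), // deprecated alias
    crawlEntireDomain: z.boolean().optional(),      // preferred name
  })
  .transform((x) => {
    // Same precedence as the patch: an explicit crawlEntireDomain value
    // wins over whatever allowBackwardLinks was set to.
    if (x.crawlEntireDomain !== undefined) {
      x.allowBackwardLinks = x.crawlEntireDomain;
    }
    return x;
  });

console.log(crawlFlags.parse({ allowBackwardLinks: false, crawlEntireDomain: true }));
// -> { allowBackwardLinks: true, crawlEntireDomain: true }

console.log(crawlFlags.parse({ allowBackwardLinks: true }));
// -> { allowBackwardLinks: true }  (legacy flag still honored when used alone)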
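
Caller-side usage sketch for the renamed option, assuming the published @mendable/firecrawl-js package and its crawlUrl method; the method name and response fields are not shown in this diff, so treat them as assumptions rather than part of the patch:

import FirecrawlApp from "@mendable/firecrawl-js";

// Hypothetical usage of the new CrawlParams field added above; the deprecated
// allowBackwardLinks flag would still be accepted for existing integrations.
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "" });

async function main(): Promise<void> {
  const res = await app.crawlUrl("https://firecrawl.dev", {
    crawlEntireDomain: true, // new name for allowBackwardLinks
    limit: 5,
  });

  if (res.success) {
    console.log(`crawled ${res.completed} pages`);
  } else {
    console.error(res.error);
  }
}

main().catch(console.error);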