feat: add followInternalLinks parameter as semantic replacement for allowBackwardLinks (#1684)

* feat: add followInternalLinks parameter as semantic replacement for allowBackwardLinks

- Add followInternalLinks parameter to crawl API with the same functionality as allowBackwardLinks
- Update transformation logic to use followInternalLinks with precedence over allowBackwardLinks
- Add parameter to Python SDK crawl methods with proper precedence handling
- Add parameter to Node.js SDK CrawlParams interface
- Add comprehensive tests for new parameter and backward compatibility
- Maintain full backward compatibility for existing allowBackwardLinks usage
- Add deprecation notices in documentation while preserving functionality
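
For context, a minimal sketch of the intended call shape at this point in the PR, via the Python SDK (crawl_url as the entry point and the exact snake_case kwarg spellings are assumptions; the parameter is renamed in a later commit below):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-...")

    # follow_internal_links mirrors allow_backward_links; when both are
    # supplied, follow_internal_links is the value the API honors.
    job = app.crawl_url(
        "https://example.com",
        follow_internal_links=True,   # new, semantic name
        allow_backward_links=False,   # deprecated; overridden by the new flag
        limit=5,
    )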

Co-Authored-By: Nick <nicolascamara29@gmail.com>

* fix: revert accidental cache=True changes to preserve original cache parameter handling

- Revert cache=True back to cache=cache in generate_llms_text methods
- Preserve original parameter passing behavior for cache parameter
- Fix accidental hardcoding of cache parameter to True
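
In other words, a before/after sketch of the affected line (the surrounding generate_llms_text body is assumed):

    # Accidentally hardcoded during this PR:
    #   params['cache'] = True
    # Reverted so the caller's value passes through unchanged:
    params['cache'] = cache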

Co-Authored-By: Nick <nicolascamara29@gmail.com>

* refactor: rename followInternalLinks to crawlEntireDomain across API, SDKs, and tests

- Rename followInternalLinks parameter to crawlEntireDomain in API schema
- Update Node.js SDK CrawlParams interface to use crawlEntireDomain
- Update Python SDK methods to use crawl_entire_domain parameter
- Update test cases to use new crawlEntireDomain parameter name
- Maintain backward compatibility with allowBackwardLinks
- Update transformation logic to give crawlEntireDomain precedence over allowBackwardLinks
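
A hedged usage sketch after the rename (crawl_url as the method name is an assumption; the precedence behavior matches the SDK logic in the diffs below):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-...")

    # New name: follow links anywhere on the same domain, not just child paths.
    job = app.crawl_url("https://firecrawl.dev", crawl_entire_domain=True, limit=5)

    # Backward compatibility: if both flags are passed, crawl_entire_domain is
    # sent as crawlEntireDomain and allow_backward_links is ignored.
    job = app.crawl_url(
        "https://firecrawl.dev",
        allow_backward_links=False,  # deprecated alias
        crawl_entire_domain=True,    # takes precedence
        limit=5,
    )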

Co-Authored-By: Nick <nicolascamara29@gmail.com>

* fix: add missing cache parameter to generate_llms_text and update documentation references
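
For reference, a hypothetical call with the restored parameter (the positional/keyword shape is assumed):

    result = app.generate_llms_text("https://firecrawl.dev", cache=False)  # skip cached results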

Co-Authored-By: Nick <nicolascamara29@gmail.com>

* Update apps/python-sdk/firecrawl/firecrawl.py

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nick <nicolascamara29@gmail.com>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
Authored by devin-ai-integration[bot] on 2025-06-20 12:02:23 -03:00; committed by GitHub
Commit 09aabbedb5 (parent f939428264)
4 changed files with 86 additions and 17 deletions

@@ -100,4 +100,44 @@ describe("Crawl tests", () => {
   // }
   // }
   // }, 300000);
+
+  it.concurrent("crawlEntireDomain parameter works", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      crawlEntireDomain: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
+
+  it.concurrent("crawlEntireDomain takes precedence over allowBackwardLinks", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      allowBackwardLinks: false,
+      crawlEntireDomain: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
+
+  it.concurrent("backward compatibility - allowBackwardLinks still works", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      allowBackwardLinks: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
 });

@@ -615,7 +615,8 @@ const crawlerOptions = z
     maxDepth: z.number().default(10), // default?
     maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
-    allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
+    allowBackwardLinks: z.boolean().default(false), // DEPRECATED: use crawlEntireDomain
+    crawlEntireDomain: z.boolean().optional(),
     allowExternalLinks: z.boolean().default(false),
     allowSubdomains: z.boolean().default(false),
     ignoreRobotsTxt: z.boolean().default(false),
@@ -632,7 +633,8 @@ const crawlerOptions = z
 //   excludePaths?: string[];
 //   maxDepth?: number;
 //   limit?: number;
-//   allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
+//   allowBackwardLinks?: boolean; // DEPRECATED: use crawlEntireDomain
+//   crawlEntireDomain?: boolean;
 //   allowExternalLinks?: boolean;
 //   ignoreSitemap?: boolean;
 // };
@@ -652,10 +654,15 @@ export const crawlRequestSchema = crawlerOptions
   .strict(strictMessage)
   .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
   .refine((x) => fire1Refine(x.scrapeOptions), fire1RefineOpts)
-  .transform((x) => ({
-    ...x,
-    scrapeOptions: extractTransform(x.scrapeOptions),
-  }));
+  .transform((x) => {
+    if (x.crawlEntireDomain !== undefined) {
+      x.allowBackwardLinks = x.crawlEntireDomain;
+    }
+    return {
+      ...x,
+      scrapeOptions: extractTransform(x.scrapeOptions),
+    };
+  });
 
 // export type CrawlRequest = {
 //   url: string;
@@ -1041,7 +1048,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     maxDepth: x.maxDepth,
     limit: x.limit,
     generateImgAltText: false,
-    allowBackwardCrawling: x.allowBackwardLinks,
+    allowBackwardCrawling: x.crawlEntireDomain ?? x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
@@ -1062,6 +1069,7 @@ export function toNewCrawlerOptions(x: any): CrawlerOptions {
     limit: x.limit,
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
+    crawlEntireDomain: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
@@ -1085,6 +1093,7 @@ export function fromLegacyCrawlerOptions(x: any, teamId: string): {
     limit: x.maxCrawledLinks ?? x.limit,
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
+    crawlEntireDomain: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,

@@ -209,6 +209,7 @@ export interface CrawlParams {
   maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
+  crawlEntireDomain?: boolean;
   allowExternalLinks?: boolean;
   ignoreSitemap?: boolean;
   scrapeOptions?: CrawlScrapeOptions;

@@ -687,6 +687,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -710,7 +711,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -749,7 +751,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -802,6 +806,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -823,7 +828,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -862,7 +868,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -1051,6 +1059,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -1073,7 +1082,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -2811,7 +2821,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             * limit - Maximum pages to crawl
           Link Following:
-            * allowBackwardLinks - Follow parent directory links
+            * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
+            * crawlEntireDomain - Follow parent directory links
             * allowExternalLinks - Follow external domain links
             * ignoreSitemap - Skip sitemap.xml processing
@@ -3290,6 +3301,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3312,7 +3324,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3350,7 +3363,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -3402,6 +3417,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3424,7 +3440,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3458,7 +3475,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links