Mirror of https://github.com/mendableai/firecrawl.git (synced 2025-12-27 07:03:44 +00:00)
feat: add followInternalLinks parameter as semantic replacement for allowBackwardLinks (#1684)
* feat: add followInternalLinks parameter as semantic replacement for allowBackwardLinks

  - Add followInternalLinks parameter to crawl API with same functionality as allowBackwardLinks
  - Update transformation logic to use followInternalLinks with precedence over allowBackwardLinks
  - Add parameter to Python SDK crawl methods with proper precedence handling
  - Add parameter to Node.js SDK CrawlParams interface
  - Add comprehensive tests for new parameter and backward compatibility
  - Maintain full backward compatibility for existing allowBackwardLinks usage
  - Add deprecation notices in documentation while preserving functionality

  Co-Authored-By: Nick <nicolascamara29@gmail.com>

* fix: revert accidental cache=True changes to preserve original cache parameter handling

  - Revert cache=True back to cache=cache in generate_llms_text methods
  - Preserve original parameter passing behavior for cache parameter
  - Fix accidental hardcoding of cache parameter to True

  Co-Authored-By: Nick <nicolascamara29@gmail.com>

* refactor: rename followInternalLinks to crawlEntireDomain across API, SDKs, and tests

  - Rename followInternalLinks parameter to crawlEntireDomain in API schema
  - Update Node.js SDK CrawlParams interface to use crawlEntireDomain
  - Update Python SDK methods to use crawl_entire_domain parameter
  - Update test cases to use new crawlEntireDomain parameter name
  - Maintain backward compatibility with allowBackwardLinks
  - Update transformation logic to use crawlEntireDomain with precedence

  Co-Authored-By: Nick <nicolascamara29@gmail.com>

* fix: add missing cache parameter to generate_llms_text and update documentation references

  Co-Authored-By: Nick <nicolascamara29@gmail.com>

* Update apps/python-sdk/firecrawl/firecrawl.py

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Nick <nicolascamara29@gmail.com>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in: parent f939428264, commit 09aabbedb5
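At the API level, the change means a crawl request can express "crawl the whole domain" with the new flag while older clients keep sending the deprecated one. A minimal sketch of the two request shapes follows; the /v1/crawl path and Bearer auth header are assumptions about the public crawl endpoint, not something shown in this diff.

// Sketch only: endpoint path and auth header are assumptions, not part of this diff.
const body = {
  url: "https://firecrawl.dev",
  limit: 5,
  crawlEntireDomain: true,      // new, preferred flag
  // allowBackwardLinks: true,  // deprecated spelling, still accepted
};

const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify(body),
});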
@@ -100,4 +100,44 @@ describe("Crawl tests", () => {
   // }
   // }
   // }, 300000);
 
+  it.concurrent("crawlEntireDomain parameter works", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      crawlEntireDomain: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
+
+  it.concurrent("crawlEntireDomain takes precedence over allowBackwardLinks", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      allowBackwardLinks: false,
+      crawlEntireDomain: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
+
+  it.concurrent("backward compatibility - allowBackwardLinks still works", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      allowBackwardLinks: true,
+      limit: 5,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBeGreaterThan(0);
+    }
+  }, 120000);
 });
@@ -615,7 +615,8 @@ const crawlerOptions = z
     maxDepth: z.number().default(10), // default?
     maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
-    allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
+    allowBackwardLinks: z.boolean().default(false), // DEPRECATED: use crawlEntireDomain
+    crawlEntireDomain: z.boolean().optional(),
     allowExternalLinks: z.boolean().default(false),
     allowSubdomains: z.boolean().default(false),
     ignoreRobotsTxt: z.boolean().default(false),
@@ -632,7 +633,8 @@ const crawlerOptions = z
 //   excludePaths?: string[];
 //   maxDepth?: number;
 //   limit?: number;
-//   allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
+//   allowBackwardLinks?: boolean; // DEPRECATED: use crawlEntireDomain
+//   crawlEntireDomain?: boolean;
 //   allowExternalLinks?: boolean;
 //   ignoreSitemap?: boolean;
 // };
@@ -652,10 +654,15 @@ export const crawlRequestSchema = crawlerOptions
   .strict(strictMessage)
   .refine((x) => extractRefine(x.scrapeOptions), extractRefineOpts)
   .refine((x) => fire1Refine(x.scrapeOptions), fire1RefineOpts)
-  .transform((x) => ({
-    ...x,
-    scrapeOptions: extractTransform(x.scrapeOptions),
-  }));
+  .transform((x) => {
+    if (x.crawlEntireDomain !== undefined) {
+      x.allowBackwardLinks = x.crawlEntireDomain;
+    }
+    return {
+      ...x,
+      scrapeOptions: extractTransform(x.scrapeOptions),
+    };
+  });
 
 // export type CrawlRequest = {
 //   url: string;
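The new .transform in crawlRequestSchema back-fills the legacy flag before the request is processed further. A stripped-down sketch of just that behavior, using zod directly with the schema reduced to the two flags (everything else omitted for illustration):

import { z } from "zod";

// Reduced sketch of the crawlRequestSchema transform above: crawlEntireDomain,
// when present, overwrites the deprecated allowBackwardLinks.
const linkFlags = z
  .object({
    allowBackwardLinks: z.boolean().default(false), // DEPRECATED: use crawlEntireDomain
    crawlEntireDomain: z.boolean().optional(),
  })
  .transform((x) => {
    if (x.crawlEntireDomain !== undefined) {
      x.allowBackwardLinks = x.crawlEntireDomain;
    }
    return x;
  });

// linkFlags.parse({ crawlEntireDomain: true })
//   -> { allowBackwardLinks: true, crawlEntireDomain: true }
// linkFlags.parse({ allowBackwardLinks: true })
//   -> { allowBackwardLinks: true }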
@@ -1041,7 +1048,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     maxDepth: x.maxDepth,
     limit: x.limit,
     generateImgAltText: false,
-    allowBackwardCrawling: x.allowBackwardLinks,
+    allowBackwardCrawling: x.crawlEntireDomain ?? x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
@@ -1062,6 +1069,7 @@ export function toNewCrawlerOptions(x: any): CrawlerOptions {
     limit: x.limit,
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
+    crawlEntireDomain: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
@@ -1085,6 +1093,7 @@ export function fromLegacyCrawlerOptions(x: any, teamId: string): {
     limit: x.maxCrawledLinks ?? x.limit,
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
+    crawlEntireDomain: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
     ignoreRobotsTxt: x.ignoreRobotsTxt,
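In the option converters above, toLegacyCrawlerOptions now prefers crawlEntireDomain via nullish coalescing, while toNewCrawlerOptions and fromLegacyCrawlerOptions surface the legacy allowBackwardCrawling value under both names. A sketch of that round trip, with simplified option shapes standing in for the real CrawlerOptions types:

// Simplified stand-ins for the real option types; only the renamed fields are shown.
type NewOpts = { allowBackwardLinks: boolean; crawlEntireDomain?: boolean };
type LegacyOpts = { allowBackwardCrawling: boolean };

// Mirrors toLegacyCrawlerOptions: the new flag wins whenever it is set.
const toLegacy = (x: NewOpts): LegacyOpts => ({
  allowBackwardCrawling: x.crawlEntireDomain ?? x.allowBackwardLinks,
});

// Mirrors toNewCrawlerOptions / fromLegacyCrawlerOptions: the legacy value is
// exposed under both the deprecated and the new name.
const toNew = (x: LegacyOpts): NewOpts => ({
  allowBackwardLinks: x.allowBackwardCrawling,
  crawlEntireDomain: x.allowBackwardCrawling,
});

// toLegacy({ allowBackwardLinks: false, crawlEntireDomain: true })
//   -> { allowBackwardCrawling: true }
// toNew({ allowBackwardCrawling: true })
//   -> { allowBackwardLinks: true, crawlEntireDomain: true }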
@@ -209,6 +209,7 @@ export interface CrawlParams {
   maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
+  crawlEntireDomain?: boolean;
   allowExternalLinks?: boolean;
   ignoreSitemap?: boolean;
   scrapeOptions?: CrawlScrapeOptions;
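With the interface change, the new flag type-checks in Node.js SDK calls. A small usage sketch; the crawlUrl method name and package import are how the JS SDK is commonly used, but they are not part of this diff, so treat them as assumptions and confirm against your installed SDK version.

import FirecrawlApp, { CrawlParams } from "@mendable/firecrawl-js";

// Sketch: crawlEntireDomain is now a typed member of CrawlParams.
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

const params: CrawlParams = {
  limit: 5,
  crawlEntireDomain: true, // preferred over the deprecated allowBackwardLinks
};

const result = await app.crawlUrl("https://firecrawl.dev", params);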
@@ -687,6 +687,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -710,7 +711,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -749,7 +751,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -802,6 +806,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -823,7 +828,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -862,7 +868,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -1051,6 +1059,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -1073,7 +1082,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -2811,7 +2821,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
                 * limit - Maximum pages to crawl
 
             Link Following:
-                * allowBackwardLinks - Follow parent directory links
+                * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
+                * crawlEntireDomain - Follow parent directory links
                 * allowExternalLinks - Follow external domain links
                 * ignoreSitemap - Skip sitemap.xml processing
 
@@ -3290,6 +3301,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3312,7 +3324,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3350,7 +3363,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -3402,6 +3417,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3424,7 +3440,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3458,7 +3475,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
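One detail worth noting in the Python SDK hunks above: because of the if/elif, only one of the two flags is serialized into the request; when both keyword arguments are passed, crawl_entire_domain wins and allow_backward_links is dropped rather than merged. A TypeScript sketch of that mapping logic (names mirror the SDK kwargs; the helper itself is purely illustrative):

// Mirrors the if/elif in the Python SDK: exactly one of the two flags is sent,
// and the new one wins. Hypothetical helper for illustration only.
function buildLinkFollowingParams(opts: {
  allowBackwardLinks?: boolean;
  crawlEntireDomain?: boolean;
}): Record<string, boolean> {
  const params: Record<string, boolean> = {};
  if (opts.crawlEntireDomain !== undefined) {
    params.crawlEntireDomain = opts.crawlEntireDomain;
  } else if (opts.allowBackwardLinks !== undefined) {
    params.allowBackwardLinks = opts.allowBackwardLinks;
  }
  return params;
}

// buildLinkFollowingParams({ crawlEntireDomain: true, allowBackwardLinks: false })
//   -> { crawlEntireDomain: true }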