Merge branch 'main' into nsc/new-extract

Nicolas 2024-11-20 16:41:13 -08:00
commit c78dae178b
8 changed files with 1534 additions and 10 deletions

View File

@@ -2,7 +2,7 @@
 Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project locally, so you can run it on your own (and contribute)
-If you're contributing, note that the process is similar to other open source repos i.e. (fork firecrawl, make changes, run tests, PR). If you have any questions, and would like help gettin on board, reach out to hello@mendable.ai for more or submit an issue!
+If you're contributing, note that the process is similar to other open source repos i.e. (fork firecrawl, make changes, run tests, PR). If you have any questions, and would like help gettin on board, reach out to help@firecrawl.com for more or submit an issue!
 ## Running the project locally

View File

@@ -77,10 +77,10 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
 - **LLM-ready formats**: markdown, structured data, screenshot, HTML, links, metadata
 - **The hard stuff**: proxies, anti-bot mechanisms, dynamic content (js-rendered), output parsing, orchestration
 - **Customizability**: exclude tags, crawl behind auth walls with custom headers, max crawl depth, etc...
-- **Media parsing**: pdfs, docx, images.
+- **Media parsing**: pdfs, docx, images
-- **Reliability first**: designed to get the data you need - no matter how hard it is.
+- **Reliability first**: designed to get the data you need - no matter how hard it is
 - **Actions**: click, scroll, input, wait and more before extracting data
-- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
+- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint.
 You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)

View File

@@ -230,6 +230,7 @@ const crawlerOptions = z.object({
   limit: z.number().default(10000), // default?
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
+  allowSubdomains: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
   ignoreQueryParameters: z.boolean().default(false),
@@ -502,6 +503,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     generateImgAltText: false,
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
+    allowSubdomains: x.allowSubdomains,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
@@ -517,6 +519,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
     maxDepth: x.maxDepth,
     allowBackwardLinks: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
+    allowSubdomains: x.allowSubdomains,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
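Taken together, these three hunks add an `allowSubdomains` flag that defaults to `false` in the v1 request schema and is carried through both directions of the legacy-options conversion. A minimal sketch of that flow, assuming zod is available; the schema below is only the fragment relevant here, and `toLegacy` is an illustrative stand-in for `toLegacyCrawlerOptions`:

```ts
import { z } from "zod";

// Fragment of the crawler options schema -- only the fields shown in this diff.
const crawlerOptions = z.object({
  allowBackwardLinks: z.boolean().default(false),
  allowExternalLinks: z.boolean().default(false),
  allowSubdomains: z.boolean().default(false), // new flag, off by default
  ignoreSitemap: z.boolean().default(true),
});
type CrawlerOptions = z.infer<typeof crawlerOptions>;

// Mirrors toLegacyCrawlerOptions: the v1 names map onto the legacy crawler names,
// and allowSubdomains passes through unchanged.
function toLegacy(x: CrawlerOptions) {
  return {
    allowBackwardCrawling: x.allowBackwardLinks,
    allowExternalContentLinks: x.allowExternalLinks,
    allowSubdomains: x.allowSubdomains,
    ignoreSitemap: x.ignoreSitemap,
  };
}

// Requests that omit the flag keep the old behaviour; passing it opts a crawl in.
console.log(toLegacy(crawlerOptions.parse({})).allowSubdomains);                        // false
console.log(toLegacy(crawlerOptions.parse({ allowSubdomains: true })).allowSubdomains); // true
```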

View File

@@ -148,7 +148,8 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
     res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   } else {
     const permutations = generateURLPermutations(url);
-    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
+    const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href)));
+    res = x === permutations.length;
   }
   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
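The change above only splits the expression for readability; the locking logic is unchanged. `SADD` returns how many of the supplied members were newly inserted, so the URL counts as freshly locked only if every generated permutation was unseen. A minimal sketch of that invariant, assuming an ioredis connection and a simplified stand-in for `generateURLPermutations`:

```ts
import Redis from "ioredis";

const redis = new Redis(); // assumes a reachable local Redis instance

// Simplified stand-in for generateURLPermutations (the real helper also covers
// protocol and www-prefix variants): just the URL with and without a trailing slash.
function permutations(url: string): string[] {
  const bare = new URL(url).href.replace(/\/$/, "");
  return [bare, bare + "/"];
}

async function lockURL(crawlId: string, url: string): Promise<boolean> {
  const perms = permutations(url);
  // SADD returns the number of members that were newly added to the set.
  const added = await redis.sadd("crawl:" + crawlId + ":visited", ...perms);
  // The lock succeeds only if *all* permutations were new; if another worker already
  // recorded any variant of this URL, `added` comes back smaller and we skip it.
  return added === perms.length;
}
```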
@@ -179,6 +180,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
     generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
     allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
+    allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
   });
   if (sc.robots !== undefined) {

View File

@@ -23,6 +23,7 @@ export class WebCrawler {
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
+  private allowSubdomains: boolean;
   constructor({
     jobId,
@@ -35,7 +36,8 @@ export class WebCrawler {
     generateImgAltText = false,
     maxCrawledDepth = 10,
     allowBackwardCrawling = false,
-    allowExternalContentLinks = false
+    allowExternalContentLinks = false,
+    allowSubdomains = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -48,6 +50,7 @@ export class WebCrawler {
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
+    allowSubdomains?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -63,6 +66,7 @@ export class WebCrawler {
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
+    this.allowSubdomains = allowSubdomains ?? false;
   }
   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -214,6 +218,10 @@ export class WebCrawler {
       }
     }
+    if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
+      return fullUrl;
+    }
     return null;
   }
@@ -222,8 +230,11 @@ export class WebCrawler {
     const $ = load(html);
     $("a").each((_, element) => {
-      const href = $(element).attr("href");
+      let href = $(element).attr("href");
       if (href) {
+        if (href.match(/^https?:\/[^\/]/)) {
+          href = href.replace(/^https?:\//, "$&/");
+        }
         const u = this.filterURL(href, url);
         if (u !== null) {
           links.push(u);
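The new `href` branch in `extractLinksFromHTML` repairs anchors whose scheme is followed by a single slash (e.g. `https:/example.com/page`, a pattern that turns up in malformed markup) before the link is handed to `filterURL`. In the `replace`, `$&` is the whole matched prefix, so the missing second slash is appended; well-formed `https://` links do not match and are left alone. A standalone restatement of the transform:

```ts
// Standalone restatement of the href fix-up, for illustration only.
function repairScheme(href: string): string {
  // Matches only "http:/x" or "https:/x" -- a single slash after the scheme
  // followed by a non-slash character.
  if (href.match(/^https?:\/[^\/]/)) {
    // "$&" is the full match ("https:/" or "http:/"), so this inserts the
    // missing second slash.
    href = href.replace(/^https?:\//, "$&/");
  }
  return href;
}

console.log(repairScheme("https:/example.com/page"));  // "https://example.com/page"
console.log(repairScheme("https://example.com/page")); // unchanged
console.log(repairScheme("/relative/path"));           // unchanged
```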
@@ -297,6 +308,10 @@ export class WebCrawler {
     return linkDomain === baseDomain;
   }
+  private isSubdomain(link: string): boolean {
+    return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."));
+  }
   public isFile(url: string): boolean {
     const fileExtensions = [
       ".png",

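The new `isSubdomain` helper takes the last two labels of the base host as the shared suffix and accepts any link whose hostname ends with `"." + suffix`; together with the extra branch in `filterURL` above, this is what `allowSubdomains: true` enables. A standalone restatement with a couple of worked cases (the hostnames are illustrative):

```ts
// Standalone restatement of WebCrawler.isSubdomain, for illustration only.
function isSubdomain(link: string, baseUrl: string): boolean {
  // e.g. "docs.firecrawl.dev" -> "firecrawl.dev" (last two labels of the base host)
  const suffix = new URL(baseUrl).hostname.split(".").slice(-2).join(".");
  // Relative links resolve against the base URL before the hostname check.
  return new URL(link, baseUrl).hostname.endsWith("." + suffix);
}

const base = "https://docs.firecrawl.dev";
console.log(isSubdomain("https://blog.firecrawl.dev/post", base)); // true  -> kept when allowSubdomains is set
console.log(isSubdomain("https://example.com/page", base));        // false -> still filtered out
```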
View File

@@ -350,12 +350,12 @@ async function processJob(job: Job & { id: string }, token: string) {
     await addCrawlJobDone(job.data.crawl_id, job.id);
-    if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
+    if (job.data.crawlerOptions !== null) {
       if (!sc.cancelled) {
-        const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
+        const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
         const links = crawler.filterLinks(
-          crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
+          crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!),
           Infinity,
           sc.crawlerOptions?.maxDepth ?? 10
         );

View File

@@ -160,6 +160,7 @@ const testSuiteTokens = [
   "6c46abb",
   "cb0ff78",
   "fd769b2",
+  "4c2638d",
   "cbb3462", // don't remove (s-ai)
   "824abcd" // don't remove (s-ai)
 ];

File diff suppressed because one or more lines are too long