Mirror of https://github.com/mendableai/firecrawl.git (synced 2025-09-26 17:01:27 +00:00)

Commit c78dae178b: Merge branch 'main' into nsc/new-extract
@@ -2,7 +2,7 @@
 Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project locally, so you can run it on your own (and contribute)
 
-If you're contributing, note that the process is similar to other open source repos i.e. (fork firecrawl, make changes, run tests, PR). If you have any questions, and would like help getting on board, reach out to hello@mendable.ai for more or submit an issue!
+If you're contributing, note that the process is similar to other open source repos i.e. (fork firecrawl, make changes, run tests, PR). If you have any questions, and would like help getting on board, reach out to help@firecrawl.com for more or submit an issue!
 
 ## Running the project locally
@@ -77,10 +77,10 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
 - **LLM-ready formats**: markdown, structured data, screenshot, HTML, links, metadata
 - **The hard stuff**: proxies, anti-bot mechanisms, dynamic content (js-rendered), output parsing, orchestration
 - **Customizability**: exclude tags, crawl behind auth walls with custom headers, max crawl depth, etc...
-- **Media parsing**: pdfs, docx, images.
-- **Reliability first**: designed to get the data you need - no matter how hard it is.
+- **Media parsing**: pdfs, docx, images
+- **Reliability first**: designed to get the data you need - no matter how hard it is
 - **Actions**: click, scroll, input, wait and more before extracting data
-- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
+- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint.
 
 You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
@@ -230,6 +230,7 @@ const crawlerOptions = z.object({
   limit: z.number().default(10000), // default?
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
+  allowSubdomains: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
   ignoreQueryParameters: z.boolean().default(false),
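The new `allowSubdomains` flag defaults to false, so requests that omit it keep the current same-host behavior. A minimal sketch of the defaulting, assuming zod v3 (the field name is from the diff; the standalone one-field schema is illustrative only):

```ts
import { z } from "zod";

// Illustrative slice of the schema: omitting the flag yields false,
// so existing API callers are unaffected by the new option.
const crawlerOptions = z.object({
  allowSubdomains: z.boolean().default(false),
});

console.log(crawlerOptions.parse({}));                        // { allowSubdomains: false }
console.log(crawlerOptions.parse({ allowSubdomains: true })); // { allowSubdomains: true }
```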
@@ -502,6 +503,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
   generateImgAltText: false,
   allowBackwardCrawling: x.allowBackwardLinks,
   allowExternalContentLinks: x.allowExternalLinks,
+  allowSubdomains: x.allowSubdomains,
   ignoreSitemap: x.ignoreSitemap,
   deduplicateSimilarURLs: x.deduplicateSimilarURLs,
   ignoreQueryParameters: x.ignoreQueryParameters,
@@ -517,6 +519,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
   maxDepth: x.maxDepth,
   allowBackwardLinks: x.allowBackwardCrawling,
   allowExternalLinks: x.allowExternalContentLinks,
+  allowSubdomains: x.allowSubdomains,
   ignoreSitemap: x.ignoreSitemap,
   deduplicateSimilarURLs: x.deduplicateSimilarURLs,
   ignoreQueryParameters: x.ignoreQueryParameters,
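Both converters pass the new field through unchanged: unlike `allowBackwardLinks`, which maps to the legacy name `allowBackwardCrawling`, `allowSubdomains` keeps the same name on both sides. A reduced sketch of the mapping, with the types trimmed to just the fields shown in the diff:

```ts
// Trimmed to the fields in the diff; the real types carry many more options.
type CrawlerOptionsSketch = { allowBackwardLinks: boolean; allowSubdomains: boolean };
type LegacyCrawlerOptionsSketch = { allowBackwardCrawling: boolean; allowSubdomains: boolean };

function toLegacy(x: CrawlerOptionsSketch): LegacyCrawlerOptionsSketch {
  return { allowBackwardCrawling: x.allowBackwardLinks, allowSubdomains: x.allowSubdomains };
}

function fromLegacy(x: LegacyCrawlerOptionsSketch): CrawlerOptionsSketch {
  return { allowBackwardLinks: x.allowBackwardCrawling, allowSubdomains: x.allowSubdomains };
}
```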
@@ -148,7 +148,8 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
     res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
   } else {
     const permutations = generateURLPermutations(url);
-    res = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href))) === permutations.length;
+    const x = (await redisConnection.sadd("crawl:" + id + ":visited", ...permutations.map(x => x.href)));
+    res = x === permutations.length;
   }
 
   await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
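This refactor only splits the one-liner into two statements; the logic is unchanged. Redis `SADD` returns the number of members that were actually added, so the URL counts as newly locked only when every permutation was previously unseen. A standalone sketch of the idea, assuming ioredis and that `generateURLPermutations` yields `URL` objects for equivalent spellings of the same address:

```ts
import Redis from "ioredis";

const redis = new Redis();

// SADD reports how many members were new. If even one permutation was
// already in the visited set, the count falls short and the lock fails,
// so equivalent spellings of a URL are crawled at most once.
async function lockURLSketch(id: string, permutations: URL[]): Promise<boolean> {
  const added = await redis.sadd(
    "crawl:" + id + ":visited",
    ...permutations.map((p) => p.href)
  );
  return added === permutations.length;
}
```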
@@ -179,6 +180,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
     generateImgAltText: sc.crawlerOptions?.generateImgAltText ?? false,
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
     allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
+    allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
   });
 
   if (sc.robots !== undefined) {
@@ -23,6 +23,7 @@ export class WebCrawler {
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
+  private allowSubdomains: boolean;
 
   constructor({
     jobId,
@@ -35,7 +36,8 @@ export class WebCrawler {
     generateImgAltText = false,
     maxCrawledDepth = 10,
     allowBackwardCrawling = false,
-    allowExternalContentLinks = false
+    allowExternalContentLinks = false,
+    allowSubdomains = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -48,6 +50,7 @@ export class WebCrawler {
     maxCrawledDepth?: number;
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
+    allowSubdomains?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -63,6 +66,7 @@ export class WebCrawler {
     this.generateImgAltText = generateImgAltText ?? false;
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
+    this.allowSubdomains = allowSubdomains ?? false;
   }
 
   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -214,6 +218,10 @@ export class WebCrawler {
       }
     }
 
+    if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
+      return fullUrl;
+    }
+
     return null;
   }
 
@@ -222,8 +230,11 @@ export class WebCrawler {
 
     const $ = load(html);
     $("a").each((_, element) => {
-      const href = $(element).attr("href");
+      let href = $(element).attr("href");
       if (href) {
+        if (href.match(/^https?:\/[^\/]/)) {
+          href = href.replace(/^https?:\//, "$&/");
+        }
         const u = this.filterURL(href, url);
         if (u !== null) {
           links.push(u);
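The switch from `const` to `let` enables an in-place repair of malformed hrefs that have a single slash after the scheme (e.g. `https:/example.com`). In the replacement string, `$&` expands to the whole match, so appending `/` restores the missing slash. A standalone sketch of just that repair:

```ts
// [^\/] after the single slash ensures well-formed URLs are not touched;
// "$&" re-inserts the matched "https:/" (or "http:/") plus the missing "/".
function repairHref(href: string): string {
  if (href.match(/^https?:\/[^\/]/)) {
    href = href.replace(/^https?:\//, "$&/");
  }
  return href;
}

console.log(repairHref("https:/example.com/page")); // "https://example.com/page"
console.log(repairHref("https://example.com/ok"));  // unchanged
console.log(repairHref("/relative/path"));          // unchanged
```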
@@ -297,6 +308,10 @@ export class WebCrawler {
     return linkDomain === baseDomain;
   }
 
+  private isSubdomain(link: string): boolean {
+    return new URL(link, this.baseUrl).hostname.endsWith("." + new URL(this.baseUrl).hostname.split(".").slice(-2).join("."));
+  }
+
   public isFile(url: string): boolean {
     const fileExtensions = [
       ".png",
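`isSubdomain` takes the last two labels of the base hostname as the registrable domain and accepts any hostname ending in `.` plus that suffix; note the two-label heuristic would behave loosely under multi-part TLDs such as `.co.uk`. A standalone sketch of the check, with an assumed `baseUrl` standing in for `this.baseUrl`:

```ts
// Assumed base for illustration; in the class this is this.baseUrl.
const baseUrl = "https://example.com";

function isSubdomainSketch(link: string): boolean {
  const registrable = new URL(baseUrl).hostname.split(".").slice(-2).join(".");
  return new URL(link, baseUrl).hostname.endsWith("." + registrable);
}

console.log(isSubdomainSketch("https://docs.example.com/guide")); // true
console.log(isSubdomainSketch("https://example.com/guide"));      // false (exact host, no leading dot)
console.log(isSubdomainSketch("https://notexample.com/"));        // false
```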
@@ -350,12 +350,12 @@ async function processJob(job: Job & { id: string }, token: string) {
 
   await addCrawlJobDone(job.data.crawl_id, job.id);
 
-  if (!job.data.sitemapped && job.data.crawlerOptions !== null) {
+  if (job.data.crawlerOptions !== null) {
     if (!sc.cancelled) {
-      const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl);
+      const crawler = crawlToCrawler(job.data.crawl_id, sc, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!);
 
       const links = crawler.filterLinks(
-        crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl as string),
+        crawler.extractLinksFromHTML(rawHtml ?? "", doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!),
         Infinity,
         sc.crawlerOptions?.maxDepth ?? 10
       );
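Dropping `!job.data.sitemapped` from the guard means link discovery now also runs for pages that arrived via a sitemap. The `sc.originUrl!` non-null assertions are also a tighter choice than the previous `as string` cast: `!` only strips `null | undefined` from the inferred type, while `as` is a broader coercion. A two-line illustration:

```ts
declare const originUrl: string | undefined;
const viaCast = originUrl as string; // a cast: would also "work" on wider types
const viaBang = originUrl!;          // assertion: only removes null/undefined
```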
@@ -160,6 +160,7 @@ const testSuiteTokens = [
   "6c46abb",
   "cb0ff78",
   "fd769b2",
+  "4c2638d",
   "cbb3462", // don't remove (s-ai)
   "824abcd" // don't remove (s-ai)
 ];
File diff suppressed because one or more lines are too long