| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  | import axios, { AxiosError } from "axios"; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | import cheerio, { load } from "cheerio"; | 
					
						
							|  |  |  | import { URL } from "url"; | 
					
						
							|  |  |  | import { getLinksFromSitemap } from "./sitemap"; | 
					
						
							|  |  |  | import async from "async"; | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  | import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; | 
					
						
							| 
									
										
										
										
											2024-07-03 18:01:17 -03:00
										 |  |  | import { scrapSingleUrl } from "./single_url"; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | import robotsParser from "robots-parser"; | 
					
						
							| 
									
										
										
										
											2024-06-15 16:43:37 -04:00
										 |  |  | import { getURLDepth } from "./utils/maxDepthUtils"; | 
					
						
							| 
									
										
										
										
											2024-06-24 16:33:07 -03:00
										 |  |  | import { axiosTimeout } from "../../../src/lib/timeout"; | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  | import { Logger } from "../../../src/lib/logger"; | 
					
						
							| 
									
										
										
										
											2024-10-23 01:07:03 +03:00
										 |  |  | import https from "https"; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | export class WebCrawler { | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
  // Identifier of the crawl job this crawler instance was created for.
  private jobId: string;
  // URL the crawl starts from, exactly as passed to the constructor.
  private initialUrl: string;
  // Origin of `initialUrl` (set from `new URL(initialUrl).origin`).
  private baseUrl: string;
  // Regular-expression sources; when non-empty, a link's pathname must match at least one.
  private includes: string[];
  // Regular-expression sources; a link whose pathname matches any of them is dropped.
  private excludes: string[];
  // Deprecated page cap (the constructor notes to use `limit` instead).
  private maxCrawledLinks: number;
  // Links whose depth (as computed by getURLDepth) exceeds this are filtered out.
  private maxCrawledDepth: number;
  // URLs that are skipped when new URLs are queued for crawling.
  private visited: Set<string> = new Set();
  // Crawl results collected so far, keyed by page URL with the page HTML as value.
  private crawledUrls: Map<string, string> = new Map();
  // Maximum number of pages to crawl / links to return.
  private limit: number;
  // Location of the site's robots.txt (`${baseUrl}/robots.txt`).
  private robotsTxtUrl: string;
  // Parsed robots.txt rules; starts as an empty rule set and is replaced by importRobotsTxt().
  public robots: any;
  // Stored from the constructor option; not read in the visible part of this class.
  private generateImgAltText: boolean;
  // When false, links whose pathname does not start with the initial URL's pathname are rejected.
  private allowBackwardCrawling: boolean;
  // When true, filterURL() may return links that point outside the crawled site.
  private allowExternalContentLinks: boolean;
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |   constructor({ | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |     jobId, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     initialUrl, | 
					
						
							|  |  |  |     includes, | 
					
						
							|  |  |  |     excludes, | 
					
						
							| 
									
										
										
										
											2024-05-10 12:15:54 -03:00
										 |  |  |     maxCrawledLinks = 10000, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     limit = 10000, | 
					
						
							| 
									
										
										
										
											2024-04-16 12:49:14 -04:00
										 |  |  |     generateImgAltText = false, | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |     maxCrawledDepth = 10, | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |     allowBackwardCrawling = false, | 
					
						
							|  |  |  |     allowExternalContentLinks = false | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   }: { | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |     jobId: string; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     initialUrl: string; | 
					
						
							|  |  |  |     includes?: string[]; | 
					
						
							|  |  |  |     excludes?: string[]; | 
					
						
							|  |  |  |     maxCrawledLinks?: number; | 
					
						
							|  |  |  |     limit?: number; | 
					
						
							| 
									
										
										
										
											2024-04-16 12:49:14 -04:00
										 |  |  |     generateImgAltText?: boolean; | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |     maxCrawledDepth?: number; | 
					
						
							| 
									
										
										
										
											2024-06-11 15:24:39 -03:00
										 |  |  |     allowBackwardCrawling?: boolean; | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |     allowExternalContentLinks?: boolean; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   }) { | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |     this.jobId = jobId; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     this.initialUrl = initialUrl; | 
					
						
							|  |  |  |     this.baseUrl = new URL(initialUrl).origin; | 
					
						
							| 
									
										
										
										
											2024-08-22 13:18:26 +02:00
										 |  |  |     this.includes = Array.isArray(includes) ? includes : []; | 
					
						
							|  |  |  |     this.excludes = Array.isArray(excludes) ? excludes : []; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     this.limit = limit; | 
					
						
							|  |  |  |     this.robotsTxtUrl = `${this.baseUrl}/robots.txt`; | 
					
						
							|  |  |  |     this.robots = robotsParser(this.robotsTxtUrl, ""); | 
					
						
							|  |  |  |     // Deprecated, use limit instead
 | 
					
						
							|  |  |  |     this.maxCrawledLinks = maxCrawledLinks ?? limit; | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |     this.maxCrawledDepth = maxCrawledDepth ?? 10; | 
					
						
							| 
									
										
										
										
											2024-04-16 12:49:14 -04:00
										 |  |  |     this.generateImgAltText = generateImgAltText ?? false; | 
					
						
							| 
									
										
										
										
											2024-06-11 15:24:39 -03:00
										 |  |  |     this.allowBackwardCrawling = allowBackwardCrawling ?? false; | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |     this.allowExternalContentLinks = allowExternalContentLinks ?? false; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     return sitemapLinks | 
					
						
							|  |  |  |       .filter((link) => { | 
					
						
							| 
									
										
										
										
											2024-08-20 09:11:58 -03:00
										 |  |  |         let url: URL; | 
					
						
							|  |  |  |         try { | 
					
						
							|  |  |  |           url = new URL(link.trim(), this.baseUrl); | 
					
						
							|  |  |  |         } catch (error) { | 
					
						
							|  |  |  |           Logger.debug(`Error processing link: ${link} | Error: ${error.message}`); | 
					
						
							|  |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         const path = url.pathname; | 
					
						
							| 
									
										
										
										
											2024-06-14 19:40:37 -04:00
										 |  |  |          | 
					
						
							| 
									
										
										
										
											2024-06-15 16:43:37 -04:00
										 |  |  |         const depth = getURLDepth(url.toString()); | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 19:40:37 -04:00
										 |  |  |          | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |         // Check if the link exceeds the maximum depth allowed
 | 
					
						
							|  |  |  |         if (depth > maxDepth) { | 
					
						
							|  |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |         // Check if the link should be excluded
 | 
					
						
							|  |  |  |         if (this.excludes.length > 0 && this.excludes[0] !== "") { | 
					
						
							|  |  |  |           if ( | 
					
						
							|  |  |  |             this.excludes.some((excludePattern) => | 
					
						
							|  |  |  |               new RegExp(excludePattern).test(path) | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |           ) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Check if the link matches the include patterns, if any are specified
 | 
					
						
							|  |  |  |         if (this.includes.length > 0 && this.includes[0] !== "") { | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |           if (!this.includes.some((includePattern) => | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |             new RegExp(includePattern).test(path) | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |           )) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Normalize the initial URL and the link to account for www and non-www versions
 | 
					
						
							|  |  |  |         const normalizedInitialUrl = new URL(this.initialUrl); | 
					
						
							| 
									
										
										
										
											2024-08-22 23:30:19 +02:00
										 |  |  |         let normalizedLink; | 
					
						
							|  |  |  |         try { | 
					
						
							|  |  |  |           normalizedLink = new URL(link); | 
					
						
							|  |  |  |         } catch (_) { | 
					
						
							|  |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |         const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); | 
					
						
							|  |  |  |         const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Ensure the protocol and hostname match, and the path starts with the initial URL's path
 | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |         // commented to able to handling external link on allowExternalContentLinks
 | 
					
						
							|  |  |  |         // if (linkHostname !== initialHostname) {
 | 
					
						
							|  |  |  |         //   return false;
 | 
					
						
							|  |  |  |         // }
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-11 15:24:39 -03:00
										 |  |  |         if (!this.allowBackwardCrawling) { | 
					
						
							|  |  |  |           if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; | 
					
						
							|  |  |  |         // Check if the link is disallowed by robots.txt
 | 
					
						
							|  |  |  |         if (!isAllowed) { | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |           Logger.debug(`Link disallowed by robots.txt: ${link}`); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-14 15:44:45 -03:00
										 |  |  |         if (this.isFile(link)) { | 
					
						
							|  |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         return true; | 
					
						
							|  |  |  |       }) | 
					
						
							|  |  |  |       .slice(0, limit); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-23 01:07:03 +03:00
										 |  |  |   public async getRobotsTxt(skipTlsVerification = false): Promise<string> { | 
					
						
							|  |  |  |     let extraArgs = {}; | 
					
						
							|  |  |  |     if(skipTlsVerification) { | 
					
						
							|  |  |  |       extraArgs["httpsAgent"] = new https.Agent({ | 
					
						
							|  |  |  |         rejectUnauthorized: false | 
					
						
							|  |  |  |       }); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs }); | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |     return response.data; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   public importRobotsTxt(txt: string) { | 
					
						
							|  |  |  |     this.robots = robotsParser(this.robotsTxtUrl, txt); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { | 
					
						
							|  |  |  |     Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); | 
					
						
							|  |  |  |     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); | 
					
						
							|  |  |  |     if (sitemapLinks.length > 0) { | 
					
						
							|  |  |  |       let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); | 
					
						
							|  |  |  |       return filteredLinks.map(link => ({ url: link, html: "" })); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return null; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   public async start( | 
					
						
							|  |  |  |     inProgress?: (progress: Progress) => void, | 
					
						
							| 
									
										
										
										
											2024-06-10 16:27:10 -07:00
										 |  |  |     pageOptions?: PageOptions, | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  |     crawlerOptions?: CrawlerOptions, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     concurrencyLimit: number = 5, | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |     limit: number = 10000, | 
					
						
							|  |  |  |     maxDepth: number = 10 | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |   ): Promise<{ url: string, html: string }[]> { | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  | 
 | 
					
						
							|  |  |  |     Logger.debug(`Crawler starting with ${this.initialUrl}`); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     // Fetch and parse robots.txt
 | 
					
						
							|  |  |  |     try { | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |       const txt = await this.getRobotsTxt(); | 
					
						
							|  |  |  |       this.importRobotsTxt(txt); | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |       Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     } catch (error) { | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |       Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |     if (!crawlerOptions?.ignoreSitemap){ | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |       const sm = await this.tryGetSitemap(); | 
					
						
							|  |  |  |       if (sm !== null) { | 
					
						
							|  |  |  |         return sm; | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  |       } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     const urls = await this.crawlUrls( | 
					
						
							|  |  |  |       [this.initialUrl], | 
					
						
							| 
									
										
										
										
											2024-06-10 16:27:10 -07:00
										 |  |  |       pageOptions, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       concurrencyLimit, | 
					
						
							|  |  |  |       inProgress | 
					
						
							|  |  |  |     ); | 
					
						
							| 
									
										
										
										
											2024-07-31 09:28:43 -03:00
										 |  |  |      | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     if ( | 
					
						
							|  |  |  |       urls.length === 0 && | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     ) { | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |       return [{ url: this.initialUrl, html: "" }]; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // make sure to run include exclude here again
 | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); | 
					
						
							|  |  |  |     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private async crawlUrls( | 
					
						
							|  |  |  |     urls: string[], | 
					
						
							| 
									
										
										
										
											2024-06-10 16:27:10 -07:00
										 |  |  |     pageOptions: PageOptions, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     concurrencyLimit: number, | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     inProgress?: (progress: Progress) => void, | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |   ): Promise<{ url: string, html: string }[]> { | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     const queue = async.queue(async (task: string, callback) => { | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |       Logger.debug(`Crawling ${task}`); | 
					
						
							| 
									
										
										
										
											2024-05-10 12:15:54 -03:00
										 |  |  |       if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         if (callback && typeof callback === "function") { | 
					
						
							|  |  |  |           callback(); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |         return; | 
					
						
							|  |  |  |       } | 
					
						
							| 
									
										
										
										
											2024-06-10 16:27:10 -07:00
										 |  |  |       const newUrls = await this.crawl(task, pageOptions); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |       // add the initial url if not already added
 | 
					
						
							|  |  |  |       // if (this.visited.size === 1) {
 | 
					
						
							|  |  |  |       //   let normalizedInitial = this.initialUrl;
 | 
					
						
							|  |  |  |       //   if (!normalizedInitial.endsWith("/")) {
 | 
					
						
							|  |  |  |       //     normalizedInitial = normalizedInitial + "/";
 | 
					
						
							|  |  |  |       //   }
 | 
					
						
							|  |  |  |       //   if (!newUrls.some(page => page.url === this.initialUrl)) {
 | 
					
						
							|  |  |  |       //     newUrls.push({ url: this.initialUrl, html: "" });
 | 
					
						
							|  |  |  |       //   }
 | 
					
						
							|  |  |  |       // }
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-14 12:12:40 -07:00
										 |  |  |       newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |        | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       if (inProgress && newUrls.length > 0) { | 
					
						
							|  |  |  |         inProgress({ | 
					
						
							|  |  |  |           current: this.crawledUrls.size, | 
					
						
							| 
									
										
										
										
											2024-05-10 12:15:54 -03:00
										 |  |  |           total: Math.min(this.maxCrawledLinks, this.limit), | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |           status: "SCRAPING", | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |           currentDocumentUrl: newUrls[newUrls.length - 1].url, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         }); | 
					
						
							|  |  |  |       } else if (inProgress) { | 
					
						
							|  |  |  |         inProgress({ | 
					
						
							|  |  |  |           current: this.crawledUrls.size, | 
					
						
							| 
									
										
										
										
											2024-05-10 12:15:54 -03:00
										 |  |  |           total: Math.min(this.maxCrawledLinks, this.limit), | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |           status: "SCRAPING", | 
					
						
							|  |  |  |           currentDocumentUrl: task, | 
					
						
							|  |  |  |         }); | 
					
						
							|  |  |  |       } | 
					
						
							| 
									
										
										
										
											2024-06-10 16:27:10 -07:00
										 |  |  |       await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       if (callback && typeof callback === "function") { | 
					
						
							|  |  |  |         callback(); | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     }, concurrencyLimit); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |     Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     queue.push( | 
					
						
							|  |  |  |       urls.filter( | 
					
						
							|  |  |  |         (url) => | 
					
						
							|  |  |  |           !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent") | 
					
						
							|  |  |  |       ), | 
					
						
							|  |  |  |       (err) => { | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |         if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       } | 
					
						
							|  |  |  |     ); | 
					
						
							|  |  |  |     await queue.drain(); | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |     Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`); | 
					
						
							| 
									
										
										
										
											2024-05-14 12:12:40 -07:00
										 |  |  |     return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |   public filterURL(href: string, url: string): string | null { | 
					
						
							|  |  |  |     let fullUrl = href; | 
					
						
							|  |  |  |     if (!href.startsWith("http")) { | 
					
						
							| 
									
										
										
										
											2024-08-21 20:49:25 +02:00
										 |  |  |       try { | 
					
						
							|  |  |  |         fullUrl = new URL(href, this.baseUrl).toString(); | 
					
						
							|  |  |  |       } catch (_) { | 
					
						
							|  |  |  |         return null; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     let urlObj; | 
					
						
							|  |  |  |     try { | 
					
						
							|  |  |  |       urlObj = new URL(fullUrl); | 
					
						
							|  |  |  |     } catch (_) { | 
					
						
							|  |  |  |       return null; | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |     } | 
					
						
							|  |  |  |     const path = urlObj.pathname; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
 | 
					
						
							|  |  |  |       if (this.isInternalLink(fullUrl) && | 
					
						
							|  |  |  |         this.noSections(fullUrl) && | 
					
						
							|  |  |  |         !this.matchesExcludes(path) && | 
					
						
							|  |  |  |         this.isRobotsAllowed(fullUrl) | 
					
						
							|  |  |  |       ) { | 
					
						
							|  |  |  |         return fullUrl; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } else { // EXTERNAL LINKS
 | 
					
						
							|  |  |  |       if ( | 
					
						
							|  |  |  |         this.isInternalLink(url) && | 
					
						
							|  |  |  |         this.allowExternalContentLinks && | 
					
						
							|  |  |  |         !this.isSocialMediaOrEmail(fullUrl) && | 
					
						
							|  |  |  |         !this.matchesExcludes(fullUrl, true) && | 
					
						
							|  |  |  |         !this.isExternalMainPage(fullUrl) | 
					
						
							|  |  |  |       ) { | 
					
						
							|  |  |  |         return fullUrl; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return null; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-16 23:29:30 +02:00
										 |  |  |   public extractLinksFromHTML(html: string, url: string) { | 
					
						
							|  |  |  |     let links: string[] = []; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     const $ = load(html); | 
					
						
							|  |  |  |     $("a").each((_, element) => { | 
					
						
							|  |  |  |       const href = $(element).attr("href"); | 
					
						
							|  |  |  |       if (href) { | 
					
						
							|  |  |  |         const u = this.filterURL(href, url); | 
					
						
							|  |  |  |         if (u !== null) { | 
					
						
							|  |  |  |           links.push(u); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     }); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return links; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-13 17:08:40 -03:00
										 |  |  |   async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> { | 
					
						
							| 
									
										
										
										
											2024-06-14 13:44:54 -07:00
										 |  |  |     if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) { | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       return []; | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-06-14 13:44:54 -07:00
										 |  |  |     this.visited.add(url); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     if (!url.startsWith("http")) { | 
					
						
							|  |  |  |       url = "https://" + url; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     if (url.endsWith("/")) { | 
					
						
							|  |  |  |       url = url.slice(0, -1); | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     if (this.isFile(url) || this.isSocialMediaOrEmail(url)) { | 
					
						
							|  |  |  |       return []; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     try { | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  |       let content: string = ""; | 
					
						
							| 
									
										
										
										
											2024-06-13 17:08:40 -03:00
										 |  |  |       let pageStatusCode: number; | 
					
						
							|  |  |  |       let pageError: string | undefined = undefined; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |       // If it is the first link, fetch with single url
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       if (this.visited.size === 1) { | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |         const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true }); | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  |         content = page.html ?? ""; | 
					
						
							| 
									
										
										
										
											2024-06-13 17:08:40 -03:00
										 |  |  |         pageStatusCode = page.metadata?.pageStatusCode; | 
					
						
							|  |  |  |         pageError = page.metadata?.pageError || undefined; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       } else { | 
					
						
							| 
									
										
										
										
											2024-06-24 16:33:07 -03:00
										 |  |  |         const response = await axios.get(url, { timeout: axiosTimeout }); | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |         content = response.data ?? ""; | 
					
						
							| 
									
										
										
										
											2024-06-13 17:08:40 -03:00
										 |  |  |         pageStatusCode = response.status; | 
					
						
							|  |  |  |         pageError = response.statusText != "OK" ? response.statusText : undefined; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       } | 
					
						
							| 
									
										
										
										
											2024-06-24 16:25:07 -03:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       const $ = load(content); | 
					
						
							| 
									
										
										
										
											2024-06-13 17:08:40 -03:00
										 |  |  |       let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = []; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |       // Add the initial URL to the list of links
 | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  |       if (this.visited.size === 1) { | 
					
						
							| 
									
										
										
										
											2024-06-13 17:08:40 -03:00
										 |  |  |         links.push({ url, html: content, pageStatusCode, pageError }); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |       } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-16 23:29:30 +02:00
										 |  |  |       links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError }))); | 
					
						
							| 
									
										
										
										
											2024-06-24 16:43:12 -03:00
										 |  |  |        | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  |       if (this.visited.size === 1) { | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |         return links; | 
					
						
							|  |  |  |       } | 
					
						
							| 
									
										
										
										
											2024-06-24 16:25:07 -03:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-13 20:45:11 -07:00
										 |  |  |       // Create a new list to return to avoid modifying the visited list
 | 
					
						
							| 
									
										
										
										
											2024-06-14 13:44:54 -07:00
										 |  |  |       return links.filter((link) => !this.visited.has(link.url)); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     } catch (error) { | 
					
						
							|  |  |  |       return []; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 13:44:54 -07:00
										 |  |  |   private isRobotsAllowed(url: string): boolean { | 
					
						
							|  |  |  |     return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true) | 
					
						
							|  |  |  |   } | 
					
						
							| 
									
										
										
										
											2024-06-10 18:12:41 -07:00
										 |  |  |   private normalizeCrawlUrl(url: string): string { | 
					
						
							|  |  |  |     try{ | 
					
						
							|  |  |  |       const urlObj = new URL(url); | 
					
						
							|  |  |  |       urlObj.searchParams.sort(); // Sort query parameters to normalize
 | 
					
						
							|  |  |  |       return urlObj.toString(); | 
					
						
							|  |  |  |     } catch (error) { | 
					
						
							|  |  |  |       return url; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   private matchesIncludes(url: string): boolean { | 
					
						
							|  |  |  |     if (this.includes.length === 0 || this.includes[0] == "") return true; | 
					
						
							|  |  |  |     return this.includes.some((pattern) => new RegExp(pattern).test(url)); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |   private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { | 
					
						
							|  |  |  |     return this.excludes.some((pattern) => { | 
					
						
							|  |  |  |       if (onlyDomains) | 
					
						
							|  |  |  |         return this.matchesExcludesExternalDomains(url); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       return this.excludes.some((pattern) => new RegExp(pattern).test(url)); | 
					
						
							|  |  |  |     }); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
 | 
					
						
							|  |  |  |   private matchesExcludesExternalDomains(url: string) { | 
					
						
							|  |  |  |     try { | 
					
						
							|  |  |  |       const urlObj = new URL(url); | 
					
						
							|  |  |  |       const hostname = urlObj.hostname; | 
					
						
							|  |  |  |       const pathname = urlObj.pathname; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       for (let domain of this.excludes) { | 
					
						
							|  |  |  |         let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, '')); | 
					
						
							|  |  |  |         let domainHostname = domainObj.hostname; | 
					
						
							|  |  |  |         let domainPathname = domainObj.pathname; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) { | 
					
						
							|  |  |  |           if (pathname.startsWith(domainPathname)) { | 
					
						
							|  |  |  |             return true; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |       return false; | 
					
						
							|  |  |  |     } catch (e) { | 
					
						
							|  |  |  |       return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private isExternalMainPage(url:string):boolean { | 
					
						
							|  |  |  |     return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length) | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private noSections(link: string): boolean { | 
					
						
							|  |  |  |     return !link.includes("#"); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private isInternalLink(link: string): boolean { | 
					
						
							|  |  |  |     const urlObj = new URL(link, this.baseUrl); | 
					
						
							| 
									
										
										
										
											2024-06-14 13:44:54 -07:00
										 |  |  |     const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim(); | 
					
						
							|  |  |  |     const linkDomain = urlObj.hostname.replace(/^www\./, "").trim(); | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     return linkDomain === baseDomain; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-18 21:49:37 +02:00
										 |  |  |   public isFile(url: string): boolean { | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     const fileExtensions = [ | 
					
						
							|  |  |  |       ".png", | 
					
						
							|  |  |  |       ".jpg", | 
					
						
							|  |  |  |       ".jpeg", | 
					
						
							|  |  |  |       ".gif", | 
					
						
							|  |  |  |       ".css", | 
					
						
							|  |  |  |       ".js", | 
					
						
							|  |  |  |       ".ico", | 
					
						
							|  |  |  |       ".svg", | 
					
						
							| 
									
										
										
										
											2024-07-18 17:07:21 -03:00
										 |  |  |       ".tiff", | 
					
						
							| 
									
										
										
										
											2024-04-18 11:43:57 -03:00
										 |  |  |       // ".pdf", 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       ".zip", | 
					
						
							|  |  |  |       ".exe", | 
					
						
							|  |  |  |       ".dmg", | 
					
						
							|  |  |  |       ".mp4", | 
					
						
							|  |  |  |       ".mp3", | 
					
						
							|  |  |  |       ".pptx", | 
					
						
							| 
									
										
										
										
											2024-05-16 11:48:02 -07:00
										 |  |  |       // ".docx",
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       ".xlsx", | 
					
						
							|  |  |  |       ".xml", | 
					
						
							| 
									
										
										
										
											2024-04-27 11:03:27 +01:00
										 |  |  |       ".avi", | 
					
						
							|  |  |  |       ".flv", | 
					
						
							|  |  |  |       ".woff", | 
					
						
							|  |  |  |       ".ttf", | 
					
						
							|  |  |  |       ".woff2", | 
					
						
							| 
									
										
										
										
											2024-07-31 09:28:43 -03:00
										 |  |  |       ".webp", | 
					
						
							|  |  |  |       ".inc" | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     ]; | 
					
						
							| 
									
										
										
										
											2024-10-14 15:44:45 -03:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try { | 
					
						
							|  |  |  |       const urlWithoutQuery = url.split('?')[0].toLowerCase(); | 
					
						
							|  |  |  |       return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext)); | 
					
						
							|  |  |  |     } catch (error) { | 
					
						
							|  |  |  |       Logger.error(`Error processing URL in isFile: ${error}`); | 
					
						
							|  |  |  |       return false; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private isSocialMediaOrEmail(url: string): boolean { | 
					
						
							|  |  |  |     const socialMediaOrEmail = [ | 
					
						
							|  |  |  |       "facebook.com", | 
					
						
							|  |  |  |       "twitter.com", | 
					
						
							|  |  |  |       "linkedin.com", | 
					
						
							|  |  |  |       "instagram.com", | 
					
						
							|  |  |  |       "pinterest.com", | 
					
						
							|  |  |  |       "mailto:", | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |       "github.com", | 
					
						
							|  |  |  |       "calendly.com", | 
					
						
							|  |  |  |       "discord.gg", | 
					
						
							|  |  |  |       "discord.com", | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     ]; | 
					
						
							|  |  |  |     return socialMediaOrEmail.some((ext) => url.includes(ext)); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |   // 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   private async tryFetchSitemapLinks(url: string): Promise<string[]> { | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     const normalizeUrl = (url: string) => { | 
					
						
							|  |  |  |       url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); | 
					
						
							|  |  |  |       if (url.endsWith("/")) { | 
					
						
							|  |  |  |         url = url.slice(0, -1); | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |       return url; | 
					
						
							|  |  |  |     }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     const sitemapUrl = url.endsWith("/sitemap.xml") | 
					
						
							|  |  |  |       ? url | 
					
						
							|  |  |  |       : `${url}/sitemap.xml`; | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  |     let sitemapLinks: string[] = []; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     try { | 
					
						
							| 
									
										
										
										
											2024-06-24 16:33:07 -03:00
										 |  |  |       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       if (response.status === 200) { | 
					
						
							| 
									
										
										
										
											2024-07-09 16:07:53 -03:00
										 |  |  |         sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } catch (error) {  | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |       Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |       if (error instanceof AxiosError && error.response?.status === 404) { | 
					
						
							|  |  |  |         // ignore 404
 | 
					
						
							|  |  |  |       } else { | 
					
						
							|  |  |  |         const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); | 
					
						
							|  |  |  |         if (response) { | 
					
						
							|  |  |  |           sitemapLinks = response; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     if (sitemapLinks.length === 0) { | 
					
						
							|  |  |  |       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; | 
					
						
							|  |  |  |       try { | 
					
						
							| 
									
										
										
										
											2024-06-24 16:33:07 -03:00
										 |  |  |         const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |         if (response.status === 200) { | 
					
						
							| 
									
										
										
										
											2024-08-02 11:03:01 -03:00
										 |  |  |           sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |         } | 
					
						
							|  |  |  |       } catch (error) { | 
					
						
							| 
									
										
										
										
											2024-07-23 17:30:46 -03:00
										 |  |  |         Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |         if (error instanceof AxiosError && error.response?.status === 404) { | 
					
						
							|  |  |  |           // ignore 404
 | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |           sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     const normalizedUrl = normalizeUrl(url); | 
					
						
							|  |  |  |     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); | 
					
						
							| 
									
										
										
										
											2024-06-24 16:52:01 -03:00
										 |  |  |     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { | 
					
						
							|  |  |  |       sitemapLinks.push(url); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return sitemapLinks; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | } |