| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  | import axios, { AxiosError } from "axios"; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | import cheerio, { load } from "cheerio"; | 
					
						
							|  |  |  | import { URL } from "url"; | 
					
						
							|  |  |  | import { getLinksFromSitemap } from "./sitemap"; | 
					
						
							|  |  |  | import robotsParser from "robots-parser"; | 
					
						
							| 
									
										
										
										
											2024-06-15 16:43:37 -04:00
										 |  |  | import { getURLDepth } from "./utils/maxDepthUtils"; | 
					
						
							| 
									
										
										
										
											2024-06-24 16:33:07 -03:00
										 |  |  | import { axiosTimeout } from "../../../src/lib/timeout"; | 
					
						
							| 
									
										
										
										
											2024-11-07 20:57:33 +01:00
										 |  |  | import { logger } from "../../../src/lib/logger"; | 
					
						
							| 
									
										
										
										
											2024-10-23 01:07:03 +03:00
										 |  |  | import https from "https"; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | export class WebCrawler { | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |   private jobId: string; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   private initialUrl: string; | 
					
						
							|  |  |  |   private baseUrl: string; | 
					
						
							|  |  |  |   private includes: string[]; | 
					
						
							|  |  |  |   private excludes: string[]; | 
					
						
							|  |  |  |   private maxCrawledLinks: number; | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |   private maxCrawledDepth: number; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   private visited: Set<string> = new Set(); | 
					
						
							| 
									
										
										
										
											2024-05-14 12:12:40 -07:00
										 |  |  |   private crawledUrls: Map<string, string> = new Map(); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   private limit: number; | 
					
						
							|  |  |  |   private robotsTxtUrl: string; | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |   public robots: any; | 
					
						
							| 
									
										
										
										
											2024-04-16 12:49:14 -04:00
										 |  |  |   private generateImgAltText: boolean; | 
					
						
							| 
									
										
										
										
											2024-06-11 15:24:39 -03:00
										 |  |  |   private allowBackwardCrawling: boolean; | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |   private allowExternalContentLinks: boolean; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |   constructor({ | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |     jobId, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     initialUrl, | 
					
						
							|  |  |  |     includes, | 
					
						
							|  |  |  |     excludes, | 
					
						
							| 
									
										
										
										
											2024-05-10 12:15:54 -03:00
										 |  |  |     maxCrawledLinks = 10000, | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     limit = 10000, | 
					
						
							| 
									
										
										
										
											2024-04-16 12:49:14 -04:00
										 |  |  |     generateImgAltText = false, | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |     maxCrawledDepth = 10, | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |     allowBackwardCrawling = false, | 
					
						
							|  |  |  |     allowExternalContentLinks = false | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   }: { | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |     jobId: string; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     initialUrl: string; | 
					
						
							|  |  |  |     includes?: string[]; | 
					
						
							|  |  |  |     excludes?: string[]; | 
					
						
							|  |  |  |     maxCrawledLinks?: number; | 
					
						
							|  |  |  |     limit?: number; | 
					
						
							| 
									
										
										
										
											2024-04-16 12:49:14 -04:00
										 |  |  |     generateImgAltText?: boolean; | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |     maxCrawledDepth?: number; | 
					
						
							| 
									
										
										
										
											2024-06-11 15:24:39 -03:00
										 |  |  |     allowBackwardCrawling?: boolean; | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |     allowExternalContentLinks?: boolean; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   }) { | 
					
						
							| 
									
										
										
										
											2024-07-24 14:31:25 +02:00
										 |  |  |     this.jobId = jobId; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     this.initialUrl = initialUrl; | 
					
						
							|  |  |  |     this.baseUrl = new URL(initialUrl).origin; | 
					
						
							| 
									
										
										
										
											2024-08-22 13:18:26 +02:00
										 |  |  |     this.includes = Array.isArray(includes) ? includes : []; | 
					
						
							|  |  |  |     this.excludes = Array.isArray(excludes) ? excludes : []; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     this.limit = limit; | 
					
						
							|  |  |  |     this.robotsTxtUrl = `${this.baseUrl}/robots.txt`; | 
					
						
							|  |  |  |     this.robots = robotsParser(this.robotsTxtUrl, ""); | 
					
						
							|  |  |  |     // Deprecated, use limit instead
 | 
					
						
							|  |  |  |     this.maxCrawledLinks = maxCrawledLinks ?? limit; | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |     this.maxCrawledDepth = maxCrawledDepth ?? 10; | 
					
						
							| 
									
										
										
										
											2024-04-16 12:49:14 -04:00
										 |  |  |     this.generateImgAltText = generateImgAltText ?? false; | 
					
						
							| 
									
										
										
										
											2024-06-11 15:24:39 -03:00
										 |  |  |     this.allowBackwardCrawling = allowBackwardCrawling ?? false; | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |     this.allowExternalContentLinks = allowExternalContentLinks ?? false; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     return sitemapLinks | 
					
						
							|  |  |  |       .filter((link) => { | 
					
						
							| 
									
										
										
										
											2024-08-20 09:11:58 -03:00
										 |  |  |         let url: URL; | 
					
						
							|  |  |  |         try { | 
					
						
							|  |  |  |           url = new URL(link.trim(), this.baseUrl); | 
					
						
							|  |  |  |         } catch (error) { | 
					
						
							| 
									
										
										
										
											2024-11-07 20:57:33 +01:00
										 |  |  |           logger.debug(`Error processing link: ${link} | Error: ${error.message}`); | 
					
						
							| 
									
										
										
										
											2024-08-20 09:11:58 -03:00
										 |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         const path = url.pathname; | 
					
						
							| 
									
										
										
										
											2024-06-14 19:40:37 -04:00
										 |  |  |          | 
					
						
							| 
									
										
										
										
											2024-06-15 16:43:37 -04:00
										 |  |  |         const depth = getURLDepth(url.toString()); | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 19:40:37 -04:00
										 |  |  |          | 
					
						
							| 
									
										
										
										
											2024-05-07 11:06:26 -03:00
										 |  |  |         // Check if the link exceeds the maximum depth allowed
 | 
					
						
							|  |  |  |         if (depth > maxDepth) { | 
					
						
							|  |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |         // Check if the link should be excluded
 | 
					
						
							|  |  |  |         if (this.excludes.length > 0 && this.excludes[0] !== "") { | 
					
						
							|  |  |  |           if ( | 
					
						
							|  |  |  |             this.excludes.some((excludePattern) => | 
					
						
							|  |  |  |               new RegExp(excludePattern).test(path) | 
					
						
							|  |  |  |             ) | 
					
						
							|  |  |  |           ) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Check if the link matches the include patterns, if any are specified
 | 
					
						
							|  |  |  |         if (this.includes.length > 0 && this.includes[0] !== "") { | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |           if (!this.includes.some((includePattern) => | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |             new RegExp(includePattern).test(path) | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |           )) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Normalize the initial URL and the link to account for www and non-www versions
 | 
					
						
							|  |  |  |         const normalizedInitialUrl = new URL(this.initialUrl); | 
					
						
							| 
									
										
										
										
											2024-08-22 23:30:19 +02:00
										 |  |  |         let normalizedLink; | 
					
						
							|  |  |  |         try { | 
					
						
							|  |  |  |           normalizedLink = new URL(link); | 
					
						
							|  |  |  |         } catch (_) { | 
					
						
							|  |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |         const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, ''); | 
					
						
							|  |  |  |         const linkHostname = normalizedLink.hostname.replace(/^www\./, ''); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Ensure the protocol and hostname match, and the path starts with the initial URL's path
 | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |         // commented to able to handling external link on allowExternalContentLinks
 | 
					
						
							|  |  |  |         // if (linkHostname !== initialHostname) {
 | 
					
						
							|  |  |  |         //   return false;
 | 
					
						
							|  |  |  |         // }
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-11 15:24:39 -03:00
										 |  |  |         if (!this.allowBackwardCrawling) { | 
					
						
							|  |  |  |           if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) { | 
					
						
							|  |  |  |             return false; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; | 
					
						
							|  |  |  |         // Check if the link is disallowed by robots.txt
 | 
					
						
							|  |  |  |         if (!isAllowed) { | 
					
						
							| 
									
										
										
										
											2024-11-07 20:57:33 +01:00
										 |  |  |           logger.debug(`Link disallowed by robots.txt: ${link}`); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-14 15:44:45 -03:00
										 |  |  |         if (this.isFile(link)) { | 
					
						
							|  |  |  |           return false; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |         return true; | 
					
						
							|  |  |  |       }) | 
					
						
							|  |  |  |       .slice(0, limit); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-23 01:07:03 +03:00
										 |  |  |   public async getRobotsTxt(skipTlsVerification = false): Promise<string> { | 
					
						
							|  |  |  |     let extraArgs = {}; | 
					
						
							|  |  |  |     if(skipTlsVerification) { | 
					
						
							|  |  |  |       extraArgs["httpsAgent"] = new https.Agent({ | 
					
						
							|  |  |  |         rejectUnauthorized: false | 
					
						
							|  |  |  |       }); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs }); | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |     return response.data; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   public importRobotsTxt(txt: string) { | 
					
						
							|  |  |  |     this.robots = robotsParser(this.robotsTxtUrl, txt); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> { | 
					
						
							| 
									
										
										
										
											2024-11-07 20:57:33 +01:00
										 |  |  |     logger.debug(`Fetching sitemap links from ${this.initialUrl}`); | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |     const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); | 
					
						
							|  |  |  |     if (sitemapLinks.length > 0) { | 
					
						
							|  |  |  |       let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth); | 
					
						
							|  |  |  |       return filteredLinks.map(link => ({ url: link, html: "" })); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return null; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   public filterURL(href: string, url: string): string | null { | 
					
						
							|  |  |  |     let fullUrl = href; | 
					
						
							|  |  |  |     if (!href.startsWith("http")) { | 
					
						
							| 
									
										
										
										
											2024-08-21 20:49:25 +02:00
										 |  |  |       try { | 
					
						
							| 
									
										
										
										
											2024-11-12 18:20:53 +01:00
										 |  |  |         fullUrl = new URL(href, url).toString(); | 
					
						
							| 
									
										
										
										
											2024-08-21 20:49:25 +02:00
										 |  |  |       } catch (_) { | 
					
						
							|  |  |  |         return null; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     let urlObj; | 
					
						
							|  |  |  |     try { | 
					
						
							|  |  |  |       urlObj = new URL(fullUrl); | 
					
						
							|  |  |  |     } catch (_) { | 
					
						
							|  |  |  |       return null; | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |     } | 
					
						
							|  |  |  |     const path = urlObj.pathname; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
 | 
					
						
							|  |  |  |       if (this.isInternalLink(fullUrl) && | 
					
						
							|  |  |  |         this.noSections(fullUrl) && | 
					
						
							|  |  |  |         !this.matchesExcludes(path) && | 
					
						
							|  |  |  |         this.isRobotsAllowed(fullUrl) | 
					
						
							|  |  |  |       ) { | 
					
						
							|  |  |  |         return fullUrl; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } else { // EXTERNAL LINKS
 | 
					
						
							|  |  |  |       if ( | 
					
						
							|  |  |  |         this.isInternalLink(url) && | 
					
						
							|  |  |  |         this.allowExternalContentLinks && | 
					
						
							|  |  |  |         !this.isSocialMediaOrEmail(fullUrl) && | 
					
						
							|  |  |  |         !this.matchesExcludes(fullUrl, true) && | 
					
						
							|  |  |  |         !this.isExternalMainPage(fullUrl) | 
					
						
							|  |  |  |       ) { | 
					
						
							|  |  |  |         return fullUrl; | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return null; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-16 23:29:30 +02:00
										 |  |  |   public extractLinksFromHTML(html: string, url: string) { | 
					
						
							|  |  |  |     let links: string[] = []; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     const $ = load(html); | 
					
						
							|  |  |  |     $("a").each((_, element) => { | 
					
						
							|  |  |  |       const href = $(element).attr("href"); | 
					
						
							|  |  |  |       if (href) { | 
					
						
							|  |  |  |         const u = this.filterURL(href, url); | 
					
						
							|  |  |  |         if (u !== null) { | 
					
						
							|  |  |  |           links.push(u); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     }); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-10-31 10:53:47 -03:00
										 |  |  |     // Extract links from iframes with inline src
 | 
					
						
							|  |  |  |     $("iframe").each((_, element) => { | 
					
						
							|  |  |  |       const src = $(element).attr("src"); | 
					
						
							|  |  |  |       if (src && src.startsWith("data:text/html")) { | 
					
						
							|  |  |  |         const iframeHtml = decodeURIComponent(src.split(",")[1]); | 
					
						
							|  |  |  |         const iframeLinks = this.extractLinksFromHTML(iframeHtml, url); | 
					
						
							|  |  |  |         links = links.concat(iframeLinks); | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     }); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-08-16 23:29:30 +02:00
										 |  |  |     return links; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 13:44:54 -07:00
										 |  |  |   private isRobotsAllowed(url: string): boolean { | 
					
						
							|  |  |  |     return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true) | 
					
						
							|  |  |  |   } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |   private matchesExcludes(url: string, onlyDomains: boolean = false): boolean { | 
					
						
							|  |  |  |     return this.excludes.some((pattern) => { | 
					
						
							|  |  |  |       if (onlyDomains) | 
					
						
							|  |  |  |         return this.matchesExcludesExternalDomains(url); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       return this.excludes.some((pattern) => new RegExp(pattern).test(url)); | 
					
						
							|  |  |  |     }); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
 | 
					
						
							|  |  |  |   private matchesExcludesExternalDomains(url: string) { | 
					
						
							|  |  |  |     try { | 
					
						
							|  |  |  |       const urlObj = new URL(url); | 
					
						
							|  |  |  |       const hostname = urlObj.hostname; | 
					
						
							|  |  |  |       const pathname = urlObj.pathname; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       for (let domain of this.excludes) { | 
					
						
							|  |  |  |         let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, '')); | 
					
						
							|  |  |  |         let domainHostname = domainObj.hostname; | 
					
						
							|  |  |  |         let domainPathname = domainObj.pathname; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) { | 
					
						
							|  |  |  |           if (pathname.startsWith(domainPathname)) { | 
					
						
							|  |  |  |             return true; | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |       return false; | 
					
						
							|  |  |  |     } catch (e) { | 
					
						
							|  |  |  |       return false; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private isExternalMainPage(url:string):boolean { | 
					
						
							|  |  |  |     return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length) | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private noSections(link: string): boolean { | 
					
						
							|  |  |  |     return !link.includes("#"); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private isInternalLink(link: string): boolean { | 
					
						
							|  |  |  |     const urlObj = new URL(link, this.baseUrl); | 
					
						
							| 
									
										
										
										
											2024-06-14 13:44:54 -07:00
										 |  |  |     const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim(); | 
					
						
							|  |  |  |     const linkDomain = urlObj.hostname.replace(/^www\./, "").trim(); | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     return linkDomain === baseDomain; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-18 21:49:37 +02:00
										 |  |  |   public isFile(url: string): boolean { | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     const fileExtensions = [ | 
					
						
							|  |  |  |       ".png", | 
					
						
							|  |  |  |       ".jpg", | 
					
						
							|  |  |  |       ".jpeg", | 
					
						
							|  |  |  |       ".gif", | 
					
						
							|  |  |  |       ".css", | 
					
						
							|  |  |  |       ".js", | 
					
						
							|  |  |  |       ".ico", | 
					
						
							|  |  |  |       ".svg", | 
					
						
							| 
									
										
										
										
											2024-07-18 17:07:21 -03:00
										 |  |  |       ".tiff", | 
					
						
							| 
									
										
										
										
											2024-04-18 11:43:57 -03:00
										 |  |  |       // ".pdf", 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       ".zip", | 
					
						
							|  |  |  |       ".exe", | 
					
						
							|  |  |  |       ".dmg", | 
					
						
							|  |  |  |       ".mp4", | 
					
						
							|  |  |  |       ".mp3", | 
					
						
							|  |  |  |       ".pptx", | 
					
						
							| 
									
										
										
										
											2024-05-16 11:48:02 -07:00
										 |  |  |       // ".docx",
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       ".xlsx", | 
					
						
							|  |  |  |       ".xml", | 
					
						
							| 
									
										
										
										
											2024-04-27 11:03:27 +01:00
										 |  |  |       ".avi", | 
					
						
							|  |  |  |       ".flv", | 
					
						
							|  |  |  |       ".woff", | 
					
						
							|  |  |  |       ".ttf", | 
					
						
							|  |  |  |       ".woff2", | 
					
						
							| 
									
										
										
										
											2024-07-31 09:28:43 -03:00
										 |  |  |       ".webp", | 
					
						
							|  |  |  |       ".inc" | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     ]; | 
					
						
							| 
									
										
										
										
											2024-10-14 15:44:45 -03:00
										 |  |  | 
 | 
					
						
							|  |  |  |     try { | 
					
						
							|  |  |  |       const urlWithoutQuery = url.split('?')[0].toLowerCase(); | 
					
						
							|  |  |  |       return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext)); | 
					
						
							|  |  |  |     } catch (error) { | 
					
						
							| 
									
										
										
										
											2024-11-07 20:57:33 +01:00
										 |  |  |       logger.error(`Error processing URL in isFile: ${error}`); | 
					
						
							| 
									
										
										
										
											2024-10-14 15:44:45 -03:00
										 |  |  |       return false; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private isSocialMediaOrEmail(url: string): boolean { | 
					
						
							|  |  |  |     const socialMediaOrEmail = [ | 
					
						
							|  |  |  |       "facebook.com", | 
					
						
							|  |  |  |       "twitter.com", | 
					
						
							|  |  |  |       "linkedin.com", | 
					
						
							|  |  |  |       "instagram.com", | 
					
						
							|  |  |  |       "pinterest.com", | 
					
						
							|  |  |  |       "mailto:", | 
					
						
							| 
									
										
										
										
											2024-06-28 17:23:40 -07:00
										 |  |  |       "github.com", | 
					
						
							|  |  |  |       "calendly.com", | 
					
						
							|  |  |  |       "discord.gg", | 
					
						
							|  |  |  |       "discord.com", | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     ]; | 
					
						
							|  |  |  |     return socialMediaOrEmail.some((ext) => url.includes(ext)); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   private async tryFetchSitemapLinks(url: string): Promise<string[]> { | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     const normalizeUrl = (url: string) => { | 
					
						
							|  |  |  |       url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); | 
					
						
							|  |  |  |       if (url.endsWith("/")) { | 
					
						
							|  |  |  |         url = url.slice(0, -1); | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |       return url; | 
					
						
							|  |  |  |     }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     const sitemapUrl = url.endsWith("/sitemap.xml") | 
					
						
							|  |  |  |       ? url | 
					
						
							|  |  |  |       : `${url}/sitemap.xml`; | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  |     let sitemapLinks: string[] = []; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |     try { | 
					
						
							| 
									
										
										
										
											2024-06-24 16:33:07 -03:00
										 |  |  |       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       if (response.status === 200) { | 
					
						
							| 
									
										
										
										
											2024-07-09 16:07:53 -03:00
										 |  |  |         sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); | 
					
						
							|  |  |  |       } | 
					
						
							|  |  |  |     } catch (error) {  | 
					
						
							| 
									
										
										
										
											2024-11-07 20:57:33 +01:00
										 |  |  |       logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |       if (error instanceof AxiosError && error.response?.status === 404) { | 
					
						
							|  |  |  |         // ignore 404
 | 
					
						
							|  |  |  |       } else { | 
					
						
							|  |  |  |         const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); | 
					
						
							|  |  |  |         if (response) { | 
					
						
							|  |  |  |           sitemapLinks = response; | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     if (sitemapLinks.length === 0) { | 
					
						
							|  |  |  |       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; | 
					
						
							|  |  |  |       try { | 
					
						
							| 
									
										
										
										
											2024-06-24 16:33:07 -03:00
										 |  |  |         const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |         if (response.status === 200) { | 
					
						
							| 
									
										
										
										
											2024-08-02 11:03:01 -03:00
										 |  |  |           sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |         } | 
					
						
							|  |  |  |       } catch (error) { | 
					
						
							| 
									
										
										
										
											2024-11-07 20:57:33 +01:00
										 |  |  |         logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); | 
					
						
							| 
									
										
										
										
											2024-08-13 20:51:43 +02:00
										 |  |  |         if (error instanceof AxiosError && error.response?.status === 404) { | 
					
						
							|  |  |  |           // ignore 404
 | 
					
						
							|  |  |  |         } else { | 
					
						
							|  |  |  |           sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); | 
					
						
							|  |  |  |         } | 
					
						
							| 
									
										
										
										
											2024-05-15 15:30:37 -07:00
										 |  |  |       } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     const normalizedUrl = normalizeUrl(url); | 
					
						
							|  |  |  |     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); | 
					
						
							| 
									
										
										
										
											2024-06-24 16:52:01 -03:00
										 |  |  |     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
 | 
					
						
							| 
									
										
										
										
											2024-05-15 17:13:04 -07:00
										 |  |  |     if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { | 
					
						
							|  |  |  |       sitemapLinks.push(url); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |     return sitemapLinks; | 
					
						
							| 
									
										
										
										
											2024-04-15 17:01:47 -04:00
										 |  |  |   } | 
					
						
							|  |  |  | } |