478 lines
15 KiB
TypeScript
Raw Normal View History

2024-04-15 17:01:47 -04:00
import axios from "axios";
import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import async from "async";
import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
2024-07-03 18:01:17 -03:00
import { scrapSingleUrl } from "./single_url";
2024-04-15 17:01:47 -04:00
import robotsParser from "robots-parser";
2024-06-15 16:43:37 -04:00
import { getURLDepth } from "./utils/maxDepthUtils";
2024-06-24 16:33:07 -03:00
import { axiosTimeout } from "../../../src/lib/timeout";
2024-04-15 17:01:47 -04:00
/**
 * Crawls a single website starting from an initial URL.
 *
 * Pages are discovered either from the site's sitemap or, when no sitemap
 * links are found (or the sitemap is ignored), by following `<a href>` links.
 * Discovery honours robots.txt, include/exclude patterns, a maximum depth and
 * a page limit.
 */
export class WebCrawler {
  /** User agent name used for every robots.txt check. */
  private static readonly ROBOTS_USER_AGENT = "FireCrawlAgent";

  /**
   * Extensions of assets that are never fetched as pages.
   * ".pdf" and ".docx" are not in the list, so such URLs are still crawled.
   */
  private static readonly FILE_EXTENSIONS: readonly string[] = [
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".css",
    ".js",
    ".ico",
    ".svg",
    ".zip",
    ".exe",
    ".dmg",
    ".mp4",
    ".mp3",
    ".pptx",
    ".xlsx",
    ".xml",
    ".avi",
    ".flv",
    ".woff",
    ".ttf",
    ".woff2",
    ".webp",
  ];

  /** Substrings that mark a URL as a social-media, scheduling or e-mail link. */
  private static readonly SOCIAL_MEDIA_OR_EMAIL: readonly string[] = [
    "facebook.com",
    "twitter.com",
    "linkedin.com",
    "instagram.com",
    "pinterest.com",
    "mailto:",
    "github.com",
    "calendly.com",
    "discord.gg",
    "discord.com",
  ];

  private initialUrl: string;
  /** Origin (scheme + host + port) of `initialUrl`. */
  private baseUrl: string;
  private includes: string[];
  private excludes: string[];
  private maxCrawledLinks: number;
  private maxCrawledDepth: number;
  /** URLs already handed to `crawl`, exactly as they were passed in. */
  private visited: Set<string> = new Set();
  /** Discovered URL -> HTML recorded for it. */
  private crawledUrls: Map<string, string> = new Map();
  private limit: number;
  private robotsTxtUrl: string;
  private robots: any;
  /** Stored for callers; not read anywhere inside this class. */
  private generateImgAltText: boolean;
  private allowBackwardCrawling: boolean;
  private allowExternalContentLinks: boolean;

  constructor({
    initialUrl,
    includes,
    excludes,
    maxCrawledLinks = 10000,
    limit = 10000,
    generateImgAltText = false,
    maxCrawledDepth = 10,
    allowBackwardCrawling = false,
    allowExternalContentLinks = false
  }: {
    initialUrl: string;
    includes?: string[];
    excludes?: string[];
    maxCrawledLinks?: number;
    limit?: number;
    generateImgAltText?: boolean;
    maxCrawledDepth?: number;
    allowBackwardCrawling?: boolean;
    allowExternalContentLinks?: boolean;
  }) {
    this.initialUrl = initialUrl;
    this.baseUrl = new URL(initialUrl).origin;
    this.includes = includes ?? [];
    this.excludes = excludes ?? [];
    this.limit = limit;
    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
    // Start from an empty robots.txt; `start` replaces it with the fetched one.
    this.robots = robotsParser(this.robotsTxtUrl, "");
    // Deprecated, use limit instead
    this.maxCrawledLinks = maxCrawledLinks ?? limit;
    this.maxCrawledDepth = maxCrawledDepth ?? 10;
    this.generateImgAltText = generateImgAltText ?? false;
    this.allowBackwardCrawling = allowBackwardCrawling ?? false;
    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
  }

  /**
   * Tells whether a pattern list should be applied at all.
   * An empty list, or a list whose first entry is the empty string, counts as
   * "no patterns" throughout this class.
   */
  private static hasPatterns(patterns: string[]): boolean {
    return patterns.length > 0 && patterns[0] !== "";
  }

  /**
   * Keeps the links that pass, in order: the depth limit, the exclude
   * patterns, the include patterns, the backward-crawling rule and robots.txt;
   * then truncates the result to `limit` entries.
   *
   * Include/exclude patterns are tested against the link's pathname only.
   * Links that cannot be parsed as absolute URLs are dropped.
   *
   * @param sitemapLinks Candidate absolute URLs.
   * @param limit Maximum number of links to return.
   * @param maxDepth Maximum depth, as computed by `getURLDepth`.
   */
  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
    if (sitemapLinks.length === 0) {
      return [];
    }

    // Loop-invariant work: compile the patterns and parse the initial URL once.
    const excludePatterns = WebCrawler.hasPatterns(this.excludes)
      ? this.excludes.map((pattern) => new RegExp(pattern))
      : [];
    const includePatterns = WebCrawler.hasPatterns(this.includes)
      ? this.includes.map((pattern) => new RegExp(pattern))
      : [];
    const initialPath = new URL(this.initialUrl).pathname;

    return sitemapLinks
      .filter((link) => {
        let parsedLink: URL;
        try {
          parsedLink = new URL(link);
        } catch (error) {
          // A malformed entry must not abort filtering of the remaining links.
          return false;
        }
        const path = parsedLink.pathname;

        // Check if the link exceeds the maximum depth allowed
        if (getURLDepth(parsedLink.toString()) > maxDepth) {
          return false;
        }

        // Check if the link should be excluded
        if (excludePatterns.some((pattern) => pattern.test(path))) {
          return false;
        }

        // Check if the link matches the include patterns, if any are specified
        if (includePatterns.length > 0 && !includePatterns.some((pattern) => pattern.test(path))) {
          return false;
        }

        // No hostname comparison here, so that external links can pass when
        // allowExternalContentLinks is in use.
        if (!this.allowBackwardCrawling && !path.startsWith(initialPath)) {
          return false;
        }

        // Check if the link is disallowed by robots.txt
        if (!this.isRobotsAllowed(link)) {
          console.log(`Link disallowed by robots.txt: ${link}`);
          return false;
        }

        return true;
      })
      .slice(0, limit);
  }

  /**
   * Runs the crawl and returns the discovered pages.
   *
   * robots.txt is fetched first (a failure is logged and the empty rules from
   * the constructor stay in effect). Unless `crawlerOptions.ignoreSitemap` is
   * set, sitemap links are tried next: if any are found, the filtered sitemap
   * links are returned with empty `html` and no page is crawled. Otherwise the
   * site is crawled from the initial URL.
   *
   * Note that `maxDepth` is only applied on the sitemap path; the crawl path
   * filters with the `maxCrawledDepth` given to the constructor.
   *
   * @param inProgress Optional progress callback invoked while crawling.
   * @param pageOptions Options forwarded to the scrape of the first page.
   * @param crawlerOptions Only `ignoreSitemap` is read here.
   * @param concurrencyLimit Worker count of each crawl queue.
   * @param limit Maximum number of results returned.
   * @param maxDepth Maximum depth for sitemap links.
   */
  public async start(
    inProgress?: (progress: Progress) => void,
    pageOptions?: PageOptions,
    crawlerOptions?: CrawlerOptions,
    concurrencyLimit: number = 5,
    limit: number = 10000,
    maxDepth: number = 10
  ): Promise<{ url: string, html: string }[]> {
    // Fetch and parse robots.txt
    try {
      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
      this.robots = robotsParser(this.robotsTxtUrl, response.data);
    } catch (error) {
      console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
    }

    if (!crawlerOptions?.ignoreSitemap) {
      const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
      if (sitemapLinks.length > 0) {
        const filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
        return filteredLinks.map((link) => ({ url: link, html: "" }));
      }
    }

    const urls = await this.crawlUrls(
      [this.initialUrl],
      pageOptions,
      concurrencyLimit,
      inProgress
    );

    // Nothing was crawled: fall back to the initial URL if it passes the filters.
    if (
      urls.length === 0 &&
      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
    ) {
      return [{ url: this.initialUrl, html: "" }];
    }

    // Index the HTML by URL once instead of scanning `urls` for every result.
    const htmlByUrl = new Map<string, string>();
    for (const page of urls) {
      if (!htmlByUrl.has(page.url)) {
        htmlByUrl.set(page.url, page.html);
      }
    }

    // make sure to run include exclude here again
    const filteredUrls = this.filterLinks(urls.map((page) => page.url), limit, this.maxCrawledDepth);
    return filteredUrls.map((url) => ({ url, html: htmlByUrl.get(url) || "" }));
  }

  /**
   * Crawls `urls` through a queue with `concurrencyLimit` workers and recurses
   * into every batch of newly discovered URLs.
   *
   * Each worker stops early once `crawledUrls` holds at least
   * `min(maxCrawledLinks, limit)` entries. URLs that were already visited or
   * that robots.txt does not allow are not queued.
   *
   * @returns Every entry accumulated in `crawledUrls` so far.
   */
  private async crawlUrls(
    urls: string[],
    pageOptions: PageOptions | undefined,
    concurrencyLimit: number,
    inProgress?: (progress: Progress) => void,
  ): Promise<{ url: string, html: string }[]> {
    const maxPages = Math.min(this.maxCrawledLinks, this.limit);

    const queue = async.queue(async (task: string, callback) => {
      // The queue library may or may not hand a callback to an async worker.
      const finish = () => {
        if (callback && typeof callback === "function") {
          callback();
        }
      };

      if (this.crawledUrls.size >= maxPages) {
        finish();
        return;
      }

      const newUrls = await this.crawl(task, pageOptions);
      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));

      if (inProgress) {
        inProgress({
          current: this.crawledUrls.size,
          total: maxPages,
          status: "SCRAPING",
          // Report the last discovered URL, or the task itself if nothing was found.
          currentDocumentUrl: newUrls.length > 0 ? newUrls[newUrls.length - 1].url : task,
        });
      }

      await this.crawlUrls(newUrls.map((page) => page.url), pageOptions, concurrencyLimit, inProgress);
      finish();
    }, concurrencyLimit);

    queue.push(
      urls.filter(
        (url) =>
          !this.visited.has(url) && this.robots.isAllowed(url, WebCrawler.ROBOTS_USER_AGENT)
      ),
      (err) => {
        if (err) console.error(err);
      }
    );
    await queue.drain();

    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
  }

  /**
   * Fetches one page and returns the links found on it.
   *
   * The very first page visited by this crawler is fetched with
   * `scrapSingleUrl` and is itself included in the result; later pages are
   * fetched with a plain HTTP GET. Internal links are kept when they have no
   * `#`, match no exclude pattern and are allowed by robots.txt. External
   * links are kept only when `allowExternalContentLinks` is set and they are
   * not social-media/e-mail links, not excluded domains and not a bare
   * external home page.
   *
   * NOTE(review): every returned link carries the HTML and status of the page
   * it was found on, not of the link target - confirm this is intended.
   *
   * @param url Page to fetch; "https://" is prepended when it lacks a scheme.
   * @param pageOptions Options forwarded to `scrapSingleUrl` for the first page.
   * @returns The discovered links, or an empty array when the page is skipped
   *          or anything fails while fetching/parsing it.
   */
  async crawl(url: string, pageOptions: PageOptions | undefined): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
    // A falsy robots verdict (including undefined) is treated as "do not crawl".
    if (this.visited.has(url) || !this.robots.isAllowed(url, WebCrawler.ROBOTS_USER_AGENT)) {
      return [];
    }
    this.visited.add(url);
    // Captured once so that concurrent crawls cannot change the answer later.
    const isFirstPage = this.visited.size === 1;

    if (!url.startsWith("http")) {
      url = "https://" + url;
    }
    // Relative hrefs are resolved against the page as addressed, i.e. before
    // the trailing slash is stripped (a directory URL keeps its meaning).
    const pageUrl = url;
    if (url.endsWith("/")) {
      url = url.slice(0, -1);
    }

    if (this.isFile(url) || this.isSocialMediaOrEmail(url)) {
      return [];
    }

    try {
      let content: string = "";
      let pageStatusCode: number | undefined;
      let pageError: string | undefined = undefined;

      // If it is the first link, fetch with single url
      if (isFirstPage) {
        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
        content = page.html ?? "";
        pageStatusCode = page.metadata?.pageStatusCode;
        pageError = page.metadata?.pageError || undefined;
      } else {
        const response = await axios.get(url, { timeout: axiosTimeout });
        content = response.data ?? "";
        pageStatusCode = response.status;
        pageError = response.statusText !== "OK" ? response.statusText : undefined;
      }

      const $ = load(content);
      const links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

      // Add the initial URL to the list of links
      if (isFirstPage) {
        links.push({ url, html: content, pageStatusCode, pageError });
      }

      // External links are only considered when found on an internal page.
      const followExternal = this.allowExternalContentLinks && this.isInternalLink(url);

      $("a").each((_, element) => {
        const href = $(element).attr("href");
        if (!href) {
          return;
        }
        const resolved = this.resolveLink(href, pageUrl);
        if (!resolved) {
          // Skip a malformed href instead of losing every link on the page.
          return;
        }
        const { fullUrl, path } = resolved;

        if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
          if (
            this.noSections(fullUrl) &&
            !this.matchesExcludes(path) &&
            this.isRobotsAllowed(fullUrl)
          ) {
            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
          }
        } else if ( // EXTERNAL LINKS
          followExternal &&
          !this.isSocialMediaOrEmail(fullUrl) &&
          !this.matchesExcludes(fullUrl, true) &&
          !this.isExternalMainPage(fullUrl)
        ) {
          links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
        }
      });

      if (isFirstPage) {
        return links;
      }

      // Create a new list to return to avoid modifying the visited list
      return links.filter((link) => !this.visited.has(link.url));
    } catch (error) {
      // Best effort: a page that cannot be fetched or parsed yields no links.
      return [];
    }
  }

  /**
   * Turns an href into an absolute URL string plus its pathname.
   * Hrefs that already start with "http://" or "https://" are kept verbatim;
   * everything else is resolved against `pageUrl`.
   *
   * @returns `null` when the href cannot be parsed as a URL.
   */
  private resolveLink(href: string, pageUrl: string): { fullUrl: string; path: string } | null {
    try {
      const fullUrl = /^https?:\/\//.test(href) ? href : new URL(href, pageUrl).toString();
      return { fullUrl, path: new URL(fullUrl).pathname };
    } catch (error) {
      return null;
    }
  }

  /** Returns the robots.txt verdict for `url`; an undefined verdict counts as allowed. */
  private isRobotsAllowed(url: string): boolean {
    if (!this.robots) {
      return true;
    }
    return this.robots.isAllowed(url, WebCrawler.ROBOTS_USER_AGENT) ?? true;
  }

  /** Sorts the query parameters of `url`; returns `url` unchanged if it cannot be parsed. */
  private normalizeCrawlUrl(url: string): string {
    try {
      const parsed = new URL(url);
      parsed.searchParams.sort(); // Sort query parameters to normalize
      return parsed.toString();
    } catch (error) {
      return url;
    }
  }

  /** True when no include patterns are configured or at least one matches `url`. */
  private matchesIncludes(url: string): boolean {
    if (!WebCrawler.hasPatterns(this.includes)) {
      return true;
    }
    return this.includes.some((pattern) => new RegExp(pattern).test(url));
  }

  /**
   * True when `url` is excluded.
   *
   * @param url With `onlyDomains` false, the text the exclude regexes are
   *            tested against; with `onlyDomains` true, an absolute URL.
   * @param onlyDomains When true, the exclude entries are read as
   *                    domain/path prefixes instead of regexes.
   */
  private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
    // Same "no patterns" convention as filterLinks and matchesIncludes.
    if (!WebCrawler.hasPatterns(this.excludes)) {
      return false;
    }
    if (onlyDomains) {
      return this.matchesExcludesExternalDomains(url);
    }
    return this.excludes.some((pattern) => new RegExp(pattern).test(url));
  }

  // supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
  /**
   * True when `url` is on (or on a subdomain of) an excluded domain and its
   * pathname starts with that entry's path. Exclude entries that cannot be
   * read as a domain are skipped.
   */
  private matchesExcludesExternalDomains(url: string): boolean {
    let target: URL;
    try {
      target = new URL(url);
    } catch (e) {
      return false;
    }

    for (const domain of this.excludes) {
      let excluded: URL;
      try {
        excluded = new URL('http://' + domain.replace(/^https?:\/\//, ''));
      } catch (e) {
        // Not a domain-like entry (e.g. a path regex): try the next one.
        continue;
      }
      const sameOrSubdomain =
        target.hostname === excluded.hostname ||
        target.hostname.endsWith(`.${excluded.hostname}`);
      if (sameOrSubdomain && target.pathname.startsWith(excluded.pathname)) {
        return true;
      }
    }
    return false;
  }

  /** True when nothing but empty segments follows the third "/" of `url`. */
  private isExternalMainPage(url: string): boolean {
    const segmentsAfterHost = url.split("/").slice(3);
    return !segmentsAfterHost.some((segment) => segment.length > 0);
  }

  /** True when `link` contains no "#". */
  private noSections(link: string): boolean {
    return !link.includes("#");
  }

  /**
   * True when `link` (resolved against the base URL) has the same hostname as
   * the base URL, ignoring a leading "www.". Ports are not compared.
   * Unparseable links are reported as not internal.
   */
  private isInternalLink(link: string): boolean {
    const stripWww = (hostname: string) => hostname.replace(/^www\./, "").trim();
    try {
      const baseDomain = stripWww(new URL(this.baseUrl).hostname);
      const linkDomain = stripWww(new URL(link, this.baseUrl).hostname);
      return linkDomain === baseDomain;
    } catch (error) {
      return false;
    }
  }

  /**
   * True when `url` ends with one of the known asset extensions.
   * The query string and fragment are ignored and the comparison is
   * case-insensitive.
   */
  private isFile(url: string): boolean {
    const withoutQueryOrFragment = url.split(/[?#]/)[0].toLowerCase();
    return WebCrawler.FILE_EXTENSIONS.some((ext) => withoutQueryOrFragment.endsWith(ext));
  }

  /** True when `url` contains one of the social-media / e-mail markers. */
  private isSocialMediaOrEmail(url: string): boolean {
    return WebCrawler.SOCIAL_MEDIA_OR_EMAIL.some((marker) => url.includes(marker));
  }

  /**
   * Collects sitemap links for `url`: first from `<url>/sitemap.xml` (or `url`
   * itself when it already ends with "/sitemap.xml"), then, if that yields
   * nothing, from `<origin>/sitemap.xml`.
   *
   * When links were found and none of them equals `url` (ignoring scheme,
   * "www." and a trailing slash), `url` is appended to the result.
   */
  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
    const normalizeUrl = (value: string) => {
      value = value.replace(/^https?:\/\//, "").replace(/^www\./, "");
      if (value.endsWith("/")) {
        value = value.slice(0, -1);
      }
      return value;
    };

    // Strip trailing slashes first so the path never becomes "//sitemap.xml".
    const sitemapUrl = url.endsWith("/sitemap.xml")
      ? url
      : `${url.replace(/\/+$/, "")}/sitemap.xml`;

    let sitemapLinks = await this.fetchSitemap(sitemapUrl);
    if (sitemapLinks.length === 0) {
      sitemapLinks = await this.fetchSitemap(`${this.baseUrl}/sitemap.xml`);
    }

    const normalizedUrl = normalizeUrl(url);
    const normalizedSitemapLinks = sitemapLinks.map((link) => normalizeUrl(link));

    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
    if (sitemapLinks.length > 0 && !normalizedSitemapLinks.includes(normalizedUrl)) {
      sitemapLinks.push(url);
    }
    return sitemapLinks;
  }

  /**
   * Returns the links of the sitemap at `sitemapUrl`, or an empty array when
   * the sitemap does not answer with HTTP 200 or fetching/parsing fails
   * (failures are logged).
   */
  private async fetchSitemap(sitemapUrl: string): Promise<string[]> {
    try {
      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
      if (response.status === 200) {
        return await getLinksFromSitemap(sitemapUrl);
      }
    } catch (error) {
      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
    }
    return [];
  }
}