import axios, { AxiosError } from "axios";
import { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { logger } from "../../../src/lib/logger";
import https from "https";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
private baseUrl: string;
private includes: string[];
private excludes: string[];
private maxCrawledLinks: number;
private maxCrawledDepth: number;
private visited: Set<string> = new Set();
private crawledUrls: Map<string, string> = new Map();
private limit: number;
private robotsTxtUrl: string;
public robots: any;
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;
private allowSubdomains: boolean;
constructor({
jobId,
initialUrl,
baseUrl,
includes,
excludes,
maxCrawledLinks = 10000,
limit = 10000,
generateImgAltText = false,
maxCrawledDepth = 10,
allowBackwardCrawling = false,
allowExternalContentLinks = false,
allowSubdomains = false,
}: {
jobId: string;
initialUrl: string;
baseUrl?: string;
includes?: string[];
excludes?: string[];
maxCrawledLinks?: number;
limit?: number;
generateImgAltText?: boolean;
maxCrawledDepth?: number;
allowBackwardCrawling?: boolean;
allowExternalContentLinks?: boolean;
allowSubdomains?: boolean;
}) {
this.jobId = jobId;
this.initialUrl = initialUrl;
this.baseUrl = baseUrl ?? new URL(initialUrl).origin;
this.includes = Array.isArray(includes) ? includes : [];
this.excludes = Array.isArray(excludes) ? excludes : [];
this.limit = limit;
this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
this.robots = robotsParser(this.robotsTxtUrl, "");
// Deprecated, use limit instead
this.maxCrawledLinks = maxCrawledLinks ?? limit;
this.maxCrawledDepth = maxCrawledDepth ?? 10;
this.generateImgAltText = generateImgAltText ?? false;
this.allowBackwardCrawling = allowBackwardCrawling ?? false;
this.allowExternalContentLinks = allowExternalContentLinks ?? false;
this.allowSubdomains = allowSubdomains ?? false;
}
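// Applies the crawl's depth, include/exclude, robots.txt and file-type rules
// to a list of candidate links, returning at most `limit` of them.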
public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
// If the initial URL is a sitemap.xml, skip filtering
if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
return sitemapLinks.slice(0, limit);
}
return sitemapLinks
.filter((link) => {
let url: URL;
try {
url = new URL(link.trim(), this.baseUrl);
} catch (error) {
logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
return false;
}
const path = url.pathname;
const depth = getURLDepth(url.toString());
// Check if the link exceeds the maximum depth allowed
if (depth > maxDepth) {
return false;
}
2024-04-15 17:01:47 -04:00
// Check if the link should be excluded
if (this.excludes.length > 0 && this.excludes[0] !== "") {
if (
this.excludes.some((excludePattern) =>
new RegExp(excludePattern).test(path)
)
) {
return false;
}
}
// Check if the link matches the include patterns, if any are specified
if (this.includes.length > 0 && this.includes[0] !== "") {
if (!this.includes.some((includePattern) =>
new RegExp(includePattern).test(path)
)) {
return false;
}
}
// Normalize the initial URL and the link to account for www and non-www versions
const normalizedInitialUrl = new URL(this.initialUrl);
let normalizedLink;
try {
normalizedLink = new URL(link);
} catch (_) {
return false;
}
const initialHostname = normalizedInitialUrl.hostname.replace(/^www\./, '');
const linkHostname = normalizedLink.hostname.replace(/^www\./, '');
// Ensure the protocol and hostname match, and the path starts with the initial URL's path
// Hostname check commented out so external links can be handled when allowExternalContentLinks is set
// if (linkHostname !== initialHostname) {
// return false;
// }
if (!this.allowBackwardCrawling) {
if (!normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)) {
return false;
}
}
const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
// Check if the link is disallowed by robots.txt
if (!isAllowed) {
logger.debug(`Link disallowed by robots.txt: ${link}`);
return false;
}
if (this.isFile(link)) {
return false;
}
return true;
})
.slice(0, limit);
}
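// Fetches the raw robots.txt for the base URL, optionally skipping TLS certificate verification.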
public async getRobotsTxt(skipTlsVerification = false): Promise<string> {
let extraArgs: { httpsAgent?: https.Agent } = {};
if (skipTlsVerification) {
extraArgs.httpsAgent = new https.Agent({
rejectUnauthorized: false
});
}
const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout, ...extraArgs });
return response.data;
}
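// Parses a robots.txt body so subsequent isAllowed checks use its rules.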
public importRobotsTxt(txt: string) {
this.robots = robotsParser(this.robotsTxtUrl, txt);
}
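// Tries to seed the crawl from the sitemap: returns filtered links with empty
// html placeholders, or null when the sitemap yields nothing.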
public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (fromMap && onlySitemap) {
return sitemapLinks.map(link => ({ url: link, html: "" }));
}
if (sitemapLinks.length > 0) {
const filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
return filteredLinks.map(link => ({ url: link, html: "" }));
}
return null;
}
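// Resolves an href against the page URL and applies the internal/external/subdomain
// link policies. Returns the absolute URL to crawl, or null to skip it.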
public filterURL(href: string, url: string): string | null {
let fullUrl = href;
if (!href.startsWith("http")) {
try {
fullUrl = new URL(href, url).toString();
} catch (_) {
return null;
}
}
let urlObj;
try {
urlObj = new URL(fullUrl);
} catch (_) {
return null;
}
const path = urlObj.pathname;
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
if (
this.noSections(fullUrl) &&
!this.matchesExcludes(path) &&
this.isRobotsAllowed(fullUrl)
) {
return fullUrl;
}
} else { // EXTERNAL LINKS
if (
this.isInternalLink(url) &&
this.allowExternalContentLinks &&
!this.isSocialMediaOrEmail(fullUrl) &&
!this.matchesExcludes(fullUrl, true) &&
!this.isExternalMainPage(fullUrl)
) {
return fullUrl;
}
}
if (this.allowSubdomains && !this.isSocialMediaOrEmail(fullUrl) && this.isSubdomain(fullUrl)) {
return fullUrl;
}
return null;
}
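// Collects crawlable links from anchor tags and from iframes with inline data: URLs.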
public extractLinksFromHTML(html: string, url: string) {
let links: string[] = [];
const $ = load(html);
$("a").each((_, element) => {
let href = $(element).attr("href");
if (href) {
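// Repair absolute URLs missing the second slash after the scheme,
// e.g. "https:/example.com" -> "https://example.com".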
if (href.match(/^https?:\/[^\/]/)) {
href = href.replace(/^https?:\//, "$&/");
}
const u = this.filterURL(href, url);
if (u !== null) {
links.push(u);
}
}
});
// Extract links from iframes with inline src
$("iframe").each((_, element) => {
const src = $(element).attr("src");
if (src && src.startsWith("data:text/html")) {
const iframeHtml = decodeURIComponent(src.split(",")[1]);
const iframeLinks = this.extractLinksFromHTML(iframeHtml, url);
links = links.concat(iframeLinks);
}
});
return links;
}
private isRobotsAllowed(url: string): boolean {
return this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true;
}
private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
if (onlyDomains) {
return this.matchesExcludesExternalDomains(url);
}
return this.excludes.some((pattern) => new RegExp(pattern).test(url));
}
// supported formats: "example.com/blog", "https://example.com", "blog.example.com", "example.com"
private matchesExcludesExternalDomains(url: string) {
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const pathname = urlObj.pathname;
for (let domain of this.excludes) {
let domainObj = new URL('http://' + domain.replace(/^https?:\/\//, ''));
let domainHostname = domainObj.hostname;
let domainPathname = domainObj.pathname;
if (hostname === domainHostname || hostname.endsWith(`.${domainHostname}`)) {
if (pathname.startsWith(domainPathname)) {
return true;
}
}
}
return false;
} catch (e) {
return false;
}
}
private isExternalMainPage(url: string): boolean {
return !Boolean(url.split("/").slice(3).filter(subArray => subArray.length > 0).length);
}
private noSections(link: string): boolean {
return !link.includes("#");
}
private isInternalLink(link: string): boolean {
const urlObj = new URL(link, this.baseUrl);
const baseDomain = this.baseUrl.replace(/^https?:\/\//, "").replace(/^www\./, "").trim();
const linkDomain = urlObj.hostname.replace(/^www\./, "").trim();
return linkDomain === baseDomain;
}
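// A subdomain here is any hostname under the base URL's last two domain labels.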
private isSubdomain(link: string): boolean {
const registrableDomain = new URL(this.baseUrl).hostname.split(".").slice(-2).join(".");
return new URL(link, this.baseUrl).hostname.endsWith("." + registrableDomain);
}
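// Heuristic file check: matches known static/binary extensions, ignoring the query string.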
public isFile(url: string): boolean {
const fileExtensions = [
".png",
".jpg",
".jpeg",
".gif",
".css",
".js",
".ico",
".svg",
".tiff",
// ".pdf",
".zip",
".exe",
".dmg",
".mp4",
".mp3",
".pptx",
// ".docx",
".xlsx",
".xml",
".avi",
".flv",
".woff",
".ttf",
".woff2",
".webp",
".inc"
];
try {
const urlWithoutQuery = url.split('?')[0].toLowerCase();
return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext));
} catch (error) {
logger.error(`Error processing URL in isFile: ${error}`);
return false;
}
}
private isSocialMediaOrEmail(url: string): boolean {
const socialMediaOrEmail = [
"facebook.com",
"twitter.com",
"linkedin.com",
"instagram.com",
"pinterest.com",
"mailto:",
"github.com",
"calendly.com",
"discord.gg",
"discord.com",
];
return socialMediaOrEmail.some((ext) => url.includes(ext));
}
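// Looks for a sitemap at the given URL, then at the base URL, falling back to
// the fire-engine fetcher when a plain GET fails with anything other than a 404.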
private async tryFetchSitemapLinks(url: string): Promise<string[]> {
const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
};
const sitemapUrl = url.endsWith(".xml")
? url
: `${url}/sitemap.xml`;
let sitemapLinks: string[] = [];
try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
}
} catch (error) {
logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) {
sitemapLinks = response;
}
}
}
if (sitemapLinks.length === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
}
} catch (error) {
logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
}
}
}
const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
// Push the initial URL only when the sitemap actually returned links; adding it to
// an empty list would make an empty sitemap look valid and prevent the crawler from crawling.
if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
sitemapLinks.push(url);
}
return sitemapLinks;
}
}
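
// Usage sketch (illustrative, not part of this module): seeding a crawl from the
// sitemap. The job id, URL, and option values below are assumed examples.
//
// const crawler = new WebCrawler({
//   jobId: "example-job-id",
//   initialUrl: "https://example.com/blog",
//   maxCrawledDepth: 3,
//   limit: 100,
// });
// crawler.importRobotsTxt(await crawler.getRobotsTxt());
// const seeds = await crawler.tryGetSitemap(); // [{ url, html: "" }, ...] or null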