import * as cheerio from "cheerio";
import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import {
  Document,
  PageOptions,
  FireEngineResponse,
  ExtractorOptions,
} from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
import { scrapWithFetch } from "./scrapers/fetch";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { scrapWithPlaywright } from "./scrapers/playwright";
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
import { extractLinks } from "./utils/utils";
import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
import { clientSideError } from "../../strings";
dotenv.config();
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
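
// Scrapers in default priority order; entries whose required env vars are not set are filtered out.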
export const baseScrapers = [
  useFireEngine ? "fire-engine;chrome-cdp" : undefined,
  useFireEngine ? "fire-engine" : undefined,
  useScrapingBee ? "scrapingBee" : undefined,
  useFireEngine ? undefined : "playwright",
  useScrapingBee ? "scrapingBeeLoad" : undefined,
  "fetch",
].filter(Boolean);

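/**
 * Build the request parameters for a scraping service call, merging in any
 * site-specific overrides from urlSpecificParams keyed by hostname.
 * @example
 * // A sketch of the default result when no per-site overrides exist:
 * // { url, params: { timeout: 15000, wait_browser: "domcontentloaded" },
 * //   headers: { "ScrapingService-Request": "TRUE" } }
 */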
export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const defaultParams = {
    url: url,
    params: { timeout: timeout, wait_browser: wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  try {
    const urlKey = new URL(url).hostname.replace(/^www\./, "");
    if (urlSpecificParams.hasOwnProperty(urlKey)) {
      return { ...defaultParams, ...urlSpecificParams[urlKey] };
    } else {
      return defaultParams;
    }
  } catch (error) {
    Logger.error(`Error generating URL key: ${error}`);
    return defaultParams;
  }
}

/**
 * Get the order of scrapers to be used for scraping a URL.
 * If the user doesn't have the env vars set for a specific scraper, it is removed from the order.
 * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
 * @param isWaitPresent Whether the request specifies a waitFor delay
 * @param isScreenshotPresent Whether the request asks for a screenshot
 * @param isHeadersPresent Whether the request passes custom headers
 * @returns The order of scrapers to be used for scraping a URL
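 * @example
 * // A sketch of the resolution, assuming only SCRAPING_BEE_API_KEY is configured
 * // (no fire-engine or Playwright env vars set):
 * // getScrapingFallbackOrder() => ["scrapingBee", "scrapingBeeLoad", "fetch"]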
 */
function getScrapingFallbackOrder(
  defaultScraper?: string,
  isWaitPresent: boolean = false,
  isScreenshotPresent: boolean = false,
  isHeadersPresent: boolean = false
) {
  const availableScrapers = baseScrapers.filter((scraper) => {
    switch (scraper) {
      case "scrapingBee":
      case "scrapingBeeLoad":
        return !!process.env.SCRAPING_BEE_API_KEY;
      case "fire-engine":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "fire-engine;chrome-cdp":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "playwright":
        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
      default:
        return true;
    }
  });

  let defaultOrder = [
    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
    useFireEngine ? "fire-engine" : undefined,
    useScrapingBee ? "scrapingBee" : undefined,
    useScrapingBee ? "scrapingBeeLoad" : undefined,
    useFireEngine ? undefined : "playwright",
    "fetch",
  ].filter(Boolean);

  // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
  //   defaultOrder = [
  //     "fire-engine",
  //     useFireEngine ? undefined : "playwright",
  //     ...defaultOrder.filter(
  //       (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
  //     ),
  //   ].filter(Boolean);
  // }

  const filteredDefaultOrder = defaultOrder.filter(
    (scraper: (typeof baseScrapers)[number]) =>
      availableScrapers.includes(scraper)
  );
  const uniqueScrapers = new Set(
    defaultScraper
      ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
      : [...filteredDefaultOrder, ...availableScrapers]
  );

  const scrapersInOrder = Array.from(uniqueScrapers);
  return scrapersInOrder as (typeof baseScrapers)[number][];
}

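/**
 * Scrape a single URL, walking the scraper fallback order until one backend
 * returns usable content.
 * @example
 * // A minimal sketch; the job id and options shown here are hypothetical:
 * // const doc = await scrapSingleUrl("job-123", "https://example.com", { onlyMainContent: true });
 */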
export async function scrapSingleUrl(
  jobId: string,
  urlToScrap: string,
  pageOptions: PageOptions,
  extractorOptions?: ExtractorOptions,
  existingHtml?: string,
  priority?: number,
  teamId?: string
): Promise<Document> {
  pageOptions = {
    includeMarkdown: pageOptions.includeMarkdown ?? true,
    includeExtract: pageOptions.includeExtract ?? false,
    onlyMainContent: pageOptions.onlyMainContent ?? false,
    includeHtml: pageOptions.includeHtml ?? false,
    includeRawHtml: pageOptions.includeRawHtml ?? false,
    waitFor: pageOptions.waitFor ?? undefined,
    screenshot: pageOptions.screenshot ?? false,
    fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
    headers: pageOptions.headers ?? undefined,
    includeLinks: pageOptions.includeLinks ?? true,
    replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
    parsePDF: pageOptions.parsePDF ?? true,
    removeTags: pageOptions.removeTags ?? [],
    onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
    useFastMode: pageOptions.useFastMode ?? false,
    disableJsDom: pageOptions.disableJsDom ?? false,
    atsv: pageOptions.atsv ?? false,
  };
  if (extractorOptions) {
    extractorOptions = {
      mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
    };
  }
  if (!existingHtml) {
    existingHtml = "";
  }

  urlToScrap = urlToScrap.trim();

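  // Try a single scrape of `url` with one scraper backend; the start and the
  // outcome are recorded via ScrapeEvents so per-job fallbacks can be traced.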
  const attemptScraping = async (
    url: string,
    method: (typeof baseScrapers)[number]
  ) => {
    let scraperResponse: {
      text: string;
      screenshot: string;
      metadata: { pageStatusCode?: number; pageError?: string | null };
    } = { text: "", screenshot: "", metadata: {} };
    let screenshot = "";

    const timer = Date.now();
    const logInsertPromise = ScrapeEvents.insert(jobId, {
      type: "scrape",
      url,
      worker: process.env.FLY_MACHINE_ID,
      method,
      result: null,
    });

    switch (method) {
      case "fire-engine":
      case "fire-engine;chrome-cdp":
        let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
        if (method === "fire-engine;chrome-cdp") {
          engine = "chrome-cdp";
        }

        if (process.env.FIRE_ENGINE_BETA_URL) {
          const response = await scrapWithFireEngine({
            url,
            waitFor: pageOptions.waitFor,
            screenshot: pageOptions.screenshot,
            fullPageScreenshot: pageOptions.fullPageScreenshot,
            pageOptions: pageOptions,
            headers: pageOptions.headers,
            fireEngineOptions: {
              engine: engine,
              atsv: pageOptions.atsv,
              disableJsDom: pageOptions.disableJsDom,
            },
            priority,
            teamId,
          });
          scraperResponse.text = response.html;
          scraperResponse.screenshot = response.screenshot;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "scrapingBee":
        if (process.env.SCRAPING_BEE_API_KEY) {
          const response = await scrapWithScrapingBee(
            url,
            "domcontentloaded",
            pageOptions.fallback === false ? 7000 : 15000
          );
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "playwright":
        if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
          const response = await scrapWithPlaywright(
            url,
            pageOptions.waitFor,
            pageOptions.headers
          );
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "scrapingBeeLoad":
        if (process.env.SCRAPING_BEE_API_KEY) {
          const response = await scrapWithScrapingBee(url, "networkidle2");
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "fetch":
        const response = await scrapWithFetch(url);
        scraperResponse.text = response.content;
        scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
        scraperResponse.metadata.pageError = response.pageError;
        break;
    }

    let customScrapedContent: FireEngineResponse | null = null;

    // Check for custom scraping conditions
    const customScraperResult = await handleCustomScraping(
      scraperResponse.text,
      url
    );

    if (customScraperResult) {
      switch (customScraperResult.scraper) {
        case "fire-engine":
          customScrapedContent = await scrapWithFireEngine({
            url: customScraperResult.url,
            waitFor: customScraperResult.waitAfterLoad,
            screenshot: false,
            pageOptions: customScraperResult.pageOptions,
          });
          if (screenshot) {
            customScrapedContent.screenshot = screenshot;
          }
          break;
        case "pdf":
          const { content, pageStatusCode, pageError } =
            await fetchAndProcessPdf(
              customScraperResult.url,
              pageOptions?.parsePDF
            );
          customScrapedContent = {
            html: content,
            screenshot,
            pageStatusCode,
            pageError,
          };
          break;
      }
    }

    if (customScrapedContent) {
      scraperResponse.text = customScrapedContent.html;
      screenshot = customScrapedContent.screenshot;
    }

    //* TODO: add an option to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
    const text = await parseMarkdown(cleanedHtml);
    const insertedLogId = await logInsertPromise;
    ScrapeEvents.updateScrapeResult(insertedLogId, {
      response_size: scraperResponse.text.length,
      success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100),
      error: scraperResponse.metadata.pageError,
      response_code: scraperResponse.metadata.pageStatusCode,
      time_taken: Date.now() - timer,
    });

    return {
      text,
      html: cleanedHtml,
      rawHtml: scraperResponse.text,
      screenshot: scraperResponse.screenshot,
      pageStatusCode: scraperResponse.metadata.pageStatusCode,
      pageError: scraperResponse.metadata.pageError || undefined,
    };
  };

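  // Aggregate result defaults; overwritten by whichever scraper in the fallback order succeeds.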
  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
    text: "",
    html: "",
    rawHtml: "",
    screenshot: "",
    pageStatusCode: 200,
    pageError: undefined,
  };

  try {
    let urlKey = urlToScrap;
    try {
      urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
    } catch (error) {
      Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
    }
    const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
    const scrapersInOrder = getScrapingFallbackOrder(
      defaultScraper,
      pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
      pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true),
      pageOptions && pageOptions.headers && pageOptions.headers !== undefined
    );

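    // Walk the fallback order until a scraper returns usable content or a terminal status code.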
    for (const scraper of scrapersInOrder) {
      // If there is already HTML from the crawler, use it
      if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) {
        let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
        text = await parseMarkdown(cleanedHtml);
        html = cleanedHtml;
        break;
      }

      const attempt = await attemptScraping(urlToScrap, scraper);
      text = attempt.text ?? "";
      html = attempt.html ?? "";
      rawHtml = attempt.rawHtml ?? "";
      screenshot = attempt.screenshot ?? "";

      if (attempt.pageStatusCode) {
        pageStatusCode = attempt.pageStatusCode;
      }

      if (attempt.pageError && attempt.pageStatusCode >= 400) {
        pageError = attempt.pageError;
      } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
        pageError = undefined;
      }

      if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
        break;
      }
      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
        Logger.debug(`⛏️ ${scraper}: Scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`);
        break;
      }
      // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
      // if (nextScraperIndex < scrapersInOrder.length) {
      //   Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
      // }
    }

    if (!text) {
      throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
    }

    const soup = cheerio.load(rawHtml);
    const metadata = extractMetadata(soup, urlToScrap);

    let linksOnPage: string[] | undefined;
    if (pageOptions.includeLinks) {
      linksOnPage = extractLinks(rawHtml, urlToScrap);
    }

    let document: Document;
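    // The screenshot is only attached to metadata when one was actually captured.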
    if (screenshot && screenshot.length > 0) {
      document = {
        content: text,
        markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
          (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
            ? rawHtml
            : undefined,
        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
        metadata: {
          ...metadata,
          screenshot: screenshot,
          sourceURL: urlToScrap,
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
      };
    } else {
      document = {
        content: text,
        markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? text : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
          (extractorOptions?.mode === "llm-extraction-from-raw-html" && pageOptions.includeExtract)
            ? rawHtml
            : undefined,
        metadata: {
          ...metadata,
          sourceURL: urlToScrap,
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
      };
    }

    return document;
  } catch (error) {
    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
    ScrapeEvents.insert(jobId, {
      type: "error",
      message: typeof error === "string" ? error : typeof error.message === "string" ? error.message : JSON.stringify(error),
      stack: error.stack,
    });

    return {
      content: "",
      markdown: pageOptions.includeMarkdown || pageOptions.includeExtract ? "" : undefined,
      html: "",
      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
        pageError: pageError,
      },
    } as Document;
  }
}