import * as cheerio from "cheerio";
import { extractMetadata } from "./utils/metadata";
import dotenv from "dotenv";
import {
  Document,
  PageOptions,
  FireEngineResponse,
  ExtractorOptions,
} from "../../lib/entities";
import { parseMarkdown } from "../../lib/html-to-markdown";
import { urlSpecificParams } from "./utils/custom/website_params";
import { fetchAndProcessPdf } from "./utils/pdfProcessor";
import { handleCustomScraping } from "./custom/handleCustomScraping";
import { removeUnwantedElements } from "./utils/removeUnwantedElements";
import { scrapWithFetch } from "./scrapers/fetch";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
import { scrapWithPlaywright } from "./scrapers/playwright";
import { scrapWithScrapingBee } from "./scrapers/scrapingBee";
import { extractLinks } from "./utils/utils";
import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events";
import { clientSideError } from "../../strings";

dotenv.config();

const useScrapingBee =
  process.env.SCRAPING_BEE_API_KEY !== "" &&
  process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine =
  process.env.FIRE_ENGINE_BETA_URL !== "" &&
  process.env.FIRE_ENGINE_BETA_URL !== undefined;

export const baseScrapers = [
  useFireEngine ? "fire-engine;chrome-cdp" : undefined,
  useFireEngine ? "fire-engine" : undefined,
  useScrapingBee ? "scrapingBee" : undefined,
  useFireEngine ? undefined : "playwright",
  useScrapingBee ? "scrapingBeeLoad" : undefined,
  "fetch",
].filter(Boolean);
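
// For illustration (assuming both FIRE_ENGINE_BETA_URL and SCRAPING_BEE_API_KEY
// are set), baseScrapers evaluates to the following priority list; note that
// playwright is skipped entirely whenever fire-engine is available:
//   ["fire-engine;chrome-cdp", "fire-engine", "scrapingBee", "scrapingBeeLoad", "fetch"]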

export async function generateRequestParams(
  url: string,
  wait_browser: string = "domcontentloaded",
  timeout: number = 15000
): Promise<any> {
  const defaultParams = {
    url: url,
    params: { timeout: timeout, wait_browser: wait_browser },
    headers: { "ScrapingService-Request": "TRUE" },
  };

  try {
    const urlKey = new URL(url).hostname.replace(/^www\./, "");
    if (urlSpecificParams.hasOwnProperty(urlKey)) {
      return { ...defaultParams, ...urlSpecificParams[urlKey] };
    } else {
      return defaultParams;
    }
  } catch (error) {
    Logger.error(`Error generating URL key: ${error}`);
    return defaultParams;
  }
}
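
// A minimal usage sketch (hypothetical URL): a host with no entry in
// urlSpecificParams gets the defaults back unchanged, while a matching entry
// is shallow-merged over them:
//   const params = await generateRequestParams("https://example.com/page");
//   // => { url, params: { timeout: 15000, wait_browser: "domcontentloaded" },
//   //      headers: { "ScrapingService-Request": "TRUE" } }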

/**
 * Get the order of scrapers to be used for scraping a URL.
 * If the user doesn't have envs set for a specific scraper, it is removed from the order.
 * @param defaultScraper The default scraper to use if the URL does not have a specific scraper order defined
 * @param isWaitPresent Whether the request specifies a waitFor delay
 * @param isScreenshotPresent Whether the request asks for a screenshot
 * @param isHeadersPresent Whether the request sets custom headers
 * @returns The order of scrapers to be used for scraping a URL
 */
function getScrapingFallbackOrder(
  defaultScraper?: string,
  isWaitPresent: boolean = false,
  isScreenshotPresent: boolean = false,
  isHeadersPresent: boolean = false
) {
  const availableScrapers = baseScrapers.filter((scraper) => {
    switch (scraper) {
      case "scrapingBee":
      case "scrapingBeeLoad":
        return !!process.env.SCRAPING_BEE_API_KEY;
      case "fire-engine":
      case "fire-engine;chrome-cdp":
        return !!process.env.FIRE_ENGINE_BETA_URL;
      case "playwright":
        return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
      default:
        return true;
    }
  });

  let defaultOrder = [
    useFireEngine ? "fire-engine;chrome-cdp" : undefined,
    useFireEngine ? "fire-engine" : undefined,
    useScrapingBee ? "scrapingBee" : undefined,
    useScrapingBee ? "scrapingBeeLoad" : undefined,
    useFireEngine ? undefined : "playwright",
    "fetch",
  ].filter(Boolean);

  // if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
  //   defaultOrder = [
  //     "fire-engine",
  //     useFireEngine ? undefined : "playwright",
  //     ...defaultOrder.filter(
  //       (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
  //     ),
  //   ].filter(Boolean);
  // }

  const filteredDefaultOrder = defaultOrder.filter(
    (scraper: (typeof baseScrapers)[number]) =>
      availableScrapers.includes(scraper)
  );
  const uniqueScrapers = new Set(
    defaultScraper
      ? [defaultScraper, ...filteredDefaultOrder, ...availableScrapers]
      : [...filteredDefaultOrder, ...availableScrapers]
  );

  const scrapersInOrder = Array.from(uniqueScrapers);
  return scrapersInOrder as (typeof baseScrapers)[number][];
}
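
// Illustrative call (assumed env: fire-engine configured, ScrapingBee and
// Playwright not):
//   getScrapingFallbackOrder("fire-engine;chrome-cdp")
//   // => ["fire-engine;chrome-cdp", "fire-engine", "fetch"]
// The defaultScraper, when given, is tried first; the Set preserves insertion
// order while deduplicating it against the fallback list.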

export async function scrapSingleUrl(
  jobId: string,
  urlToScrap: string,
  pageOptions: PageOptions,
  extractorOptions?: ExtractorOptions,
  existingHtml?: string,
  priority?: number,
  teamId?: string
): Promise<Document> {
  pageOptions = {
    includeMarkdown: pageOptions.includeMarkdown ?? true,
    includeExtract: pageOptions.includeExtract ?? false,
    onlyMainContent: pageOptions.onlyMainContent ?? false,
    includeHtml: pageOptions.includeHtml ?? false,
    includeRawHtml: pageOptions.includeRawHtml ?? false,
    waitFor: pageOptions.waitFor ?? undefined,
    screenshot: pageOptions.screenshot ?? false,
    fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
    headers: pageOptions.headers ?? undefined,
    includeLinks: pageOptions.includeLinks ?? true,
    replaceAllPathsWithAbsolutePaths:
      pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
    parsePDF: pageOptions.parsePDF ?? true,
    removeTags: pageOptions.removeTags ?? [],
    onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
    useFastMode: pageOptions.useFastMode ?? false,
    disableJsDom: pageOptions.disableJsDom ?? false,
    atsv: pageOptions.atsv ?? false,
  };
  if (extractorOptions) {
    extractorOptions = {
      mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
    };
  }
  if (!existingHtml) {
    existingHtml = "";
  }

  urlToScrap = urlToScrap.trim();

  const attemptScraping = async (
    url: string,
    method: (typeof baseScrapers)[number]
  ) => {
    let scraperResponse: {
      text: string;
      screenshot: string;
      metadata: { pageStatusCode?: number; pageError?: string | null };
    } = { text: "", screenshot: "", metadata: {} };
    let screenshot = "";

    const timer = Date.now();
    const logInsertPromise = ScrapeEvents.insert(jobId, {
      type: "scrape",
      url,
      worker: process.env.FLY_MACHINE_ID,
      method,
      result: null,
    });

    switch (method) {
      case "fire-engine":
      case "fire-engine;chrome-cdp": {
        let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
        if (method === "fire-engine;chrome-cdp") {
          engine = "chrome-cdp";
        }

        if (process.env.FIRE_ENGINE_BETA_URL) {
          const response = await scrapWithFireEngine({
            url,
            waitFor: pageOptions.waitFor,
            screenshot: pageOptions.screenshot,
            fullPageScreenshot: pageOptions.fullPageScreenshot,
            pageOptions: pageOptions,
            headers: pageOptions.headers,
            fireEngineOptions: {
              engine: engine,
              atsv: pageOptions.atsv,
              disableJsDom: pageOptions.disableJsDom,
            },
            priority,
            teamId,
          });
          scraperResponse.text = response.html;
          scraperResponse.screenshot = response.screenshot;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      }
      case "scrapingBee":
        if (process.env.SCRAPING_BEE_API_KEY) {
          const response = await scrapWithScrapingBee(
            url,
            "domcontentloaded",
            pageOptions.fallback === false ? 7000 : 15000
          );
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "playwright":
        if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
          const response = await scrapWithPlaywright(
            url,
            pageOptions.waitFor,
            pageOptions.headers
          );
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "scrapingBeeLoad":
        if (process.env.SCRAPING_BEE_API_KEY) {
          const response = await scrapWithScrapingBee(url, "networkidle2");
          scraperResponse.text = response.content;
          scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
          scraperResponse.metadata.pageError = response.pageError;
        }
        break;
      case "fetch": {
        const response = await scrapWithFetch(url);
        scraperResponse.text = response.content;
        scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
        scraperResponse.metadata.pageError = response.pageError;
        break;
      }
    }

    let customScrapedContent: FireEngineResponse | null = null;

    // Check for custom scraping conditions
    const customScraperResult = await handleCustomScraping(
      scraperResponse.text,
      url
    );

    if (customScraperResult) {
      switch (customScraperResult.scraper) {
        case "fire-engine":
          customScrapedContent = await scrapWithFireEngine({
            url: customScraperResult.url,
            waitFor: customScraperResult.waitAfterLoad,
            screenshot: false,
            pageOptions: customScraperResult.pageOptions,
          });
          if (screenshot) {
            customScrapedContent.screenshot = screenshot;
          }
          break;
        case "pdf": {
          const { content, pageStatusCode, pageError } =
            await fetchAndProcessPdf(
              customScraperResult.url,
              pageOptions?.parsePDF
            );
          customScrapedContent = {
            html: content,
            screenshot,
            pageStatusCode,
            pageError,
          };
          break;
        }
      }
    }

    if (customScrapedContent) {
      scraperResponse.text = customScrapedContent.html;
      screenshot = customScrapedContent.screenshot;
    }

    //* TODO: add an option to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
    const text = await parseMarkdown(cleanedHtml);

    const insertedLogId = await logInsertPromise;
    ScrapeEvents.updateScrapeResult(insertedLogId, {
      response_size: scraperResponse.text.length,
      // A scrape counts as successful when the status code is below 400 and
      // the extracted text is non-trivial (at least 100 characters).
      success:
        !(
          scraperResponse.metadata.pageStatusCode &&
          scraperResponse.metadata.pageStatusCode >= 400
        ) &&
        !!text &&
        text.trim().length >= 100,
      error: scraperResponse.metadata.pageError,
      response_code: scraperResponse.metadata.pageStatusCode,
      time_taken: Date.now() - timer,
    });

    return {
      text,
      html: cleanedHtml,
      rawHtml: scraperResponse.text,
      screenshot: scraperResponse.screenshot,
      pageStatusCode: scraperResponse.metadata.pageStatusCode,
      pageError: scraperResponse.metadata.pageError || undefined,
    };
  };

  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
    text: "",
    html: "",
    rawHtml: "",
    screenshot: "",
    pageStatusCode: 200,
    pageError: undefined,
  };

  try {
    let urlKey = urlToScrap;
    try {
      urlKey = new URL(urlToScrap).hostname.replace(/^www\./, "");
    } catch (error) {
      Logger.error(`Invalid URL key, trying: ${urlToScrap}`);
    }
    const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? "";
    const scrapersInOrder = getScrapingFallbackOrder(
      defaultScraper,
      pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0,
      pageOptions &&
        (pageOptions.screenshot || pageOptions.fullPageScreenshot) &&
        (pageOptions.screenshot === true ||
          pageOptions.fullPageScreenshot === true),
      pageOptions && pageOptions.headers && pageOptions.headers !== undefined
    );

    for (const scraper of scrapersInOrder) {
      // If text from a previous crawl already exists and looks valid, reuse it.
      if (
        existingHtml &&
        existingHtml.trim().length >= 100 &&
        !existingHtml.includes(clientSideError)
      ) {
        let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions);
        text = await parseMarkdown(cleanedHtml);
        html = cleanedHtml;
        break;
      }

      const attempt = await attemptScraping(urlToScrap, scraper);
      text = attempt.text ?? "";
      html = attempt.html ?? "";
      rawHtml = attempt.rawHtml ?? "";
      screenshot = attempt.screenshot ?? "";

      if (attempt.pageStatusCode) {
        pageStatusCode = attempt.pageStatusCode;
      }

      if (attempt.pageError && attempt.pageStatusCode >= 400) {
        pageError = attempt.pageError;
      } else if (attempt && attempt.pageStatusCode && attempt.pageStatusCode < 400) {
        pageError = undefined;
      }

      if (
        (text && text.trim().length >= 100) ||
        (typeof screenshot === "string" && screenshot.length > 0)
      ) {
        Logger.debug(
          `⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`
        );
        break;
      }
      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
        Logger.debug(
          `⛏️ ${scraper}: Scraped ${urlToScrap} with status code ${pageStatusCode}, breaking`
        );
        break;
      }
      // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
      // if (nextScraperIndex < scrapersInOrder.length) {
      //   Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`);
      // }
    }

    if (!text) {
      throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
    }

    const soup = cheerio.load(rawHtml);
    const metadata = extractMetadata(soup, urlToScrap);

    let linksOnPage: string[] | undefined;
    if (pageOptions.includeLinks) {
      linksOnPage = extractLinks(rawHtml, urlToScrap);
    }

    let document: Document;
    if (screenshot && screenshot.length > 0) {
      document = {
        content: text,
        markdown:
          pageOptions.includeMarkdown || pageOptions.includeExtract
            ? text
            : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
          (extractorOptions?.mode === "llm-extraction-from-raw-html" &&
            pageOptions.includeExtract)
            ? rawHtml
            : undefined,
        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
        metadata: {
          ...metadata,
          screenshot: screenshot,
          sourceURL: urlToScrap,
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
      };
    } else {
      document = {
        content: text,
        markdown:
          pageOptions.includeMarkdown || pageOptions.includeExtract
            ? text
            : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
          (extractorOptions?.mode === "llm-extraction-from-raw-html" &&
            pageOptions.includeExtract)
            ? rawHtml
            : undefined,
        metadata: {
          ...metadata,
          sourceURL: urlToScrap,
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
      };
    }

    return document;
  } catch (error) {
    Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`);
    ScrapeEvents.insert(jobId, {
      type: "error",
      message:
        typeof error === "string"
          ? error
          : typeof error.message === "string"
          ? error.message
          : JSON.stringify(error),
      stack: error.stack,
    });

    // On failure, return an empty Document that still carries the source URL
    // and the last observed status code / page error.
    return {
      content: "",
      markdown:
        pageOptions.includeMarkdown || pageOptions.includeExtract
          ? ""
          : undefined,
      html: "",
      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
        pageError: pageError,
      },
    } as Document;
  }
}
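
// A hedged usage sketch (jobId and URL are illustrative, not from this module):
//   const doc = await scrapSingleUrl(
//     "job-123",
//     "https://example.com",
//     { onlyMainContent: true } as PageOptions
//   );
//   console.log(doc.metadata.pageStatusCode, doc.content.length);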