2024-07-03 18:01:17 -03:00
import axios from "axios" ;
2024-07-12 22:02:08 -04:00
import { FireEngineOptions , FireEngineResponse } from "../../../lib/entities" ;
2024-07-03 18:01:17 -03:00
import { logScrape } from "../../../services/logging/scrape_log" ;
import { generateRequestParams } from "../single_url" ;
import { fetchAndProcessPdf } from "../utils/pdfProcessor" ;
import { universalTimeout } from "../global" ;
2024-07-03 18:01:54 -03:00
/**
 * Scrapes a URL with Fire-Engine.
 *
 * Posts a scrape job to the service at `FIRE_ENGINE_BETA_URL` and maps the
 * reply onto a `FireEngineResponse`. Errors raised while scraping are caught,
 * logged, and reported through `pageError` together with an empty `html`.
 * A log entry is handed to `logScrape` on every call, whatever the outcome.
 *
 * Values found under `params` in the result of `generateRequestParams(url)`
 * (`wait`, `engine`, `screenshot`, `fireEngineOptions`) take precedence over
 * the corresponding arguments.
 *
 * @param url The URL to scrape
 * @param waitFor The time to wait for the page to load; it is also added to the request timeout
 * @param screenshot Whether to take a screenshot
 * @param pageOptions The options for the page; forwarded in the request body, and `parsePDF` is passed to the PDF processor
 * @param fireEngineOptions Extra Fire-Engine options, spread into the request body
 * @param headers The headers to send with the request (forwarded in the request body)
 * @param options The options for the request; `endpoint: "request"` selects `/request` instead of `/scrape`
 * @returns The scraped content, with the page status code and page error when available
 */
export async function scrapWithFireEngine({
  url,
  waitFor = 0,
  screenshot = false,
  pageOptions = { parsePDF: true },
  fireEngineOptions = {},
  headers,
  options,
}: {
  url: string;
  waitFor?: number;
  screenshot?: boolean;
  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
  fireEngineOptions?: FireEngineOptions;
  headers?: Record<string, string>;
  options?: any;
}): Promise<FireEngineResponse> {
  // Mutated along the way and flushed to the scrape log in `finally`.
  const logParams = {
    url,
    scraper: "fire-engine",
    success: false,
    response_code: null,
    time_taken_seconds: null,
    error_message: null,
    html: "",
    startTime: Date.now(),
  };

  try {
    const reqParams = await generateRequestParams(url);
    const overrides = reqParams["params"];

    const waitParam = overrides?.wait ?? waitFor;
    const screenshotParam = overrides?.screenshot ?? screenshot;
    const fireEngineOptionsParam: FireEngineOptions =
      overrides?.fireEngineOptions ?? fireEngineOptions;

    // NOTE(review): `engine` is only used in log messages here. The request
    // body receives an engine only if `fireEngineOptionsParam` carries one.
    // Open question from the original author: should fireEngineOptions be the
    // first choice?
    const engine = overrides?.engine ?? fireEngineOptions?.engine ?? "playwright";

    const endpoint = options?.endpoint === "request" ? "/request" : "/scrape";

    console.log(
      ` [Fire-Engine][ ${engine} ] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"} `
    );

    // Fail with a clear message instead of posting to "undefined/scrape".
    // The throw is handled by the catch block below like any other failure.
    const baseUrl = process.env.FIRE_ENGINE_BETA_URL;
    if (!baseUrl) {
      throw new Error("FIRE_ENGINE_BETA_URL is not set");
    }

    const response = await axios.post(
      baseUrl + endpoint,
      {
        url: url,
        wait: waitParam,
        screenshot: screenshotParam,
        headers: headers,
        pageOptions: pageOptions,
        // Spread last, so these keys win over the ones above on a clash.
        ...fireEngineOptionsParam,
      },
      {
        headers: {
          "Content-Type": "application/json",
        },
        // The requested wait is added on top of the shared timeout.
        timeout: universalTimeout + waitParam,
      }
    );

    // With axios' default `validateStatus`, non-2xx replies reject and end up
    // in the catch block, so this branch is presumably only reached for 2xx
    // codes other than 200.
    if (response.status !== 200) {
      console.error(
        ` [Fire-Engine][ ${engine} ] Error fetching url: ${url} with status: ${response.status} `
      );

      const pageStatusCode = response.data?.pageStatusCode;
      const pageError = response.data?.pageError;
      logParams.error_message = pageError;
      logParams.response_code = pageStatusCode;

      if (response.data && pageStatusCode !== 200) {
        // Report the page-level status; the HTTP status was logged above.
        console.error(
          ` [Fire-Engine][ ${engine} ] Error fetching url: ${url} with page status: ${pageStatusCode} `
        );
      }

      return {
        html: "",
        screenshot: "",
        pageStatusCode,
        pageError,
      };
    }

    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
      // PDF replies are re-fetched and converted by the PDF processor.
      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
        url,
        pageOptions?.parsePDF
      );
      logParams.success = true;
      logParams.response_code = pageStatusCode;
      logParams.error_message = pageError;
      return { html: content, screenshot: "", pageStatusCode, pageError };
    }

    const data = response.data;
    // A 404 page is logged as a successful scrape alongside 2xx pages.
    logParams.success =
      (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
      data.pageStatusCode === 404;
    logParams.html = data.content ?? "";
    logParams.response_code = data.pageStatusCode;
    logParams.error_message = data.pageError;
    return {
      html: data.content ?? "",
      screenshot: data.screenshot ?? "",
      pageStatusCode: data.pageStatusCode,
      pageError: data.pageError,
    };
  } catch (error: unknown) {
    // Read `code` / `message` defensively: anything can be thrown, including
    // null or a primitive, and the handler itself must not throw.
    const { code, message } = (error ?? {}) as {
      code?: unknown;
      message?: unknown;
    };

    if (code === "ECONNABORTED") {
      console.log(` [Fire-Engine] Request timed out for ${url} `);
      logParams.error_message = "Request timed out";
    } else {
      console.error(` [Fire-Engine][c] Error fetching url: ${url} -> ${error} `);
      // Always store a string so `pageError` never carries a raw object.
      logParams.error_message =
        typeof message === "string" && message !== "" ? message : String(error);
    }

    return {
      html: "",
      screenshot: "",
      pageStatusCode: null,
      pageError: logParams.error_message,
    };
  } finally {
    const endTime = Date.now();
    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
    await logScrape(logParams, pageOptions);
  }
}
2024-07-03 18:06:53 -03:00