2024-11-07 20:57:33 +01:00
|
|
|
import { Logger } from "winston";
|
|
|
|
import { Meta } from "../..";
|
2024-12-11 19:46:11 -03:00
|
|
|
import {
|
|
|
|
fireEngineScrape,
|
|
|
|
FireEngineScrapeRequestChromeCDP,
|
|
|
|
FireEngineScrapeRequestCommon,
|
|
|
|
FireEngineScrapeRequestPlaywright,
|
2024-12-11 19:51:08 -03:00
|
|
|
FireEngineScrapeRequestTLSClient,
|
2024-12-11 19:46:11 -03:00
|
|
|
} from "./scrape";
|
2024-11-07 20:57:33 +01:00
|
|
|
import { EngineScrapeResult } from "..";
|
2024-12-11 19:46:11 -03:00
|
|
|
import {
|
|
|
|
fireEngineCheckStatus,
|
|
|
|
FireEngineCheckStatusSuccess,
|
2024-12-11 19:51:08 -03:00
|
|
|
StillProcessingError,
|
2024-12-11 19:46:11 -03:00
|
|
|
} from "./checkStatus";
|
2024-12-15 15:43:12 -03:00
|
|
|
import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
|
2024-11-07 20:57:33 +01:00
|
|
|
import * as Sentry from "@sentry/node";
|
|
|
|
import { Action } from "../../../../lib/entities";
|
|
|
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
|
|
|
|
|
|
|
/**
 * Submits a scrape job to fire-engine and polls its status until the job
 * finishes, fails, or the polling budget is exhausted.
 *
 * This function does not take `Meta` on purpose. It may not access any
 * meta values to construct the request -- that must be done by the
 * `scrapeURLWithFireEngine*` functions.
 *
 * @param logger - Base logger; child loggers are derived from it for the
 *   submit call and for every status check.
 * @param request - Fully constructed fire-engine request.
 * @param timeout - Polling budget in milliseconds, measured from the moment
 *   the job submission call has returned.
 * @returns The successful status payload returned by `fireEngineCheckStatus`.
 * @throws TimeoutError when the polling budget is exceeded before the job
 *   completes.
 * @throws EngineError | SiteError | ActionError rethrown unchanged when
 *   `fireEngineCheckStatus` reports one of them.
 * @throws Error when three unexpected errors were collected while polling;
 *   the collected errors are available under `cause.errors`.
 */
async function performFireEngineScrape<
  Engine extends
    | FireEngineScrapeRequestChromeCDP
    | FireEngineScrapeRequestPlaywright
    | FireEngineScrapeRequestTLSClient,
>(
  logger: Logger,
  request: FireEngineScrapeRequestCommon & Engine,
  timeout: number,
): Promise<FireEngineCheckStatusSuccess> {
  const scrape = await fireEngineScrape(
    logger.child({ method: "fireEngineScrape" }),
    request,
  );

  const startTime = Date.now();
  const errorLimit = 3;
  const pollIntervalMs = 250;
  // Unexpected (non fire-engine) errors seen while polling.
  const errors: unknown[] = [];

  while (true) {
    if (errors.length >= errorLimit) {
      logger.error("Error limit hit.", { errors });
      throw new Error("Error limit hit. See e.cause.errors for errors.", {
        cause: { errors },
      });
    }

    if (Date.now() - startTime > timeout) {
      logger.info(
        "Fire-engine was unable to scrape the page before timing out.",
        { errors, timeout },
      );
      throw new TimeoutError(
        "Fire-engine was unable to scrape the page before timing out",
        { cause: { errors, timeout } },
      );
    }

    try {
      // Return as soon as the job is done instead of sleeping one more poll
      // interval. `return await` keeps rejections inside this try block.
      return await fireEngineCheckStatus(
        logger.child({ method: "fireEngineCheckStatus" }),
        scrape.jobId,
      );
    } catch (error) {
      if (error instanceof StillProcessingError) {
        // Job is still running: fall through to the sleep and poll again.
      } else if (
        error instanceof EngineError ||
        error instanceof SiteError ||
        error instanceof ActionError
      ) {
        // Definitive job failure: retrying would not help, so propagate it.
        logger.debug("Fire-engine scrape job failed.", {
          error,
          jobId: scrape.jobId,
        });
        throw error;
      } else {
        // Unexpected failure of the status call itself: record it and retry
        // until the error limit is reached.
        Sentry.captureException(error);
        errors.push(error);
        logger.debug(
          `An unexpeceted error occurred while calling checkStatus. Error counter is now at ${errors.length}.`,
          { error, jobId: scrape.jobId },
        );
      }
    }

    await new Promise<void>((resolve) => setTimeout(resolve, pollIntervalMs));
  }
}
|
|
|
|
|
2024-12-11 19:46:11 -03:00
|
|
|
/**
 * Scrapes `meta.url` through fire-engine's `chrome-cdp` engine.
 *
 * The `waitFor` option and the screenshot formats are translated into
 * actions placed ahead of the caller-specified actions, because chrome-cdp
 * has no dedicated request fields for them.
 *
 * @param meta - Scrape context supplying the URL, options, internal options
 *   and logger.
 * @param timeToRun - Time budget in milliseconds; 300000 is used when
 *   undefined. The total duration of all wait actions is added on top.
 * @returns The scrape result built from the fire-engine response.
 */
export async function scrapeURLWithFireEngineChromeCDP(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  const formats = meta.options.formats;
  const wantsFullPage = formats.includes("screenshot@fullPage");
  const wantsScreenshot = formats.includes("screenshot") || wantsFullPage;

  const actions: Action[] = [];

  // Transform waitFor option into an action (unsupported by chrome-cdp)
  if (meta.options.waitFor !== 0) {
    actions.push({
      type: "wait" as const,
      milliseconds: meta.options.waitFor,
    });
  }

  // Transform screenshot format into an action (unsupported by chrome-cdp)
  if (wantsScreenshot) {
    actions.push({
      type: "screenshot" as const,
      fullPage: wantsFullPage,
    });
  }

  // Include specified actions
  actions.push(...(meta.options.actions ?? []));

  // Sum every wait action; a wait without `milliseconds` counts as 1000.
  let totalWait = 0;
  for (const action of actions) {
    if (action.type === "wait") {
      totalWait += action.milliseconds ?? 1000;
    }
  }

  const timeout = (timeToRun ?? 300000) + totalWait;
  const hasActions = actions.length > 0;

  const request: FireEngineScrapeRequestCommon &
    FireEngineScrapeRequestChromeCDP = {
    url: meta.url,
    engine: "chrome-cdp",
    instantReturn: true,
    skipTlsVerification: meta.options.skipTlsVerification,
    headers: meta.options.headers,
    ...(hasActions ? { actions } : {}),
    priority: meta.internalOptions.priority,
    geolocation: meta.options.geolocation,
    mobile: meta.options.mobile,
    timeout, // TODO: better timeout logic
    disableSmartWaitCache: meta.internalOptions.disableSmartWaitCache,
    // TODO: scrollXPaths
  };

  const callLogger = meta.logger.child({
    method: "scrapeURLWithFireEngineChromeCDP/callFireEngine",
    request,
  });
  const response = await performFireEngineScrape(callLogger, request, timeout);

  const specialtyLogger = meta.logger.child({
    method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck",
  });
  specialtyScrapeCheck(specialtyLogger, response.responseHeaders);

  if (wantsScreenshot) {
    meta.logger.debug(
      "Transforming screenshots from actions into screenshot field",
      { screenshots: response.screenshots },
    );

    // The injected screenshot action precedes the caller's actions, so the
    // first entry is moved out of the list into the `screenshot` field.
    const shots = response.screenshots ?? [];
    response.screenshot = shots[0];
    shots.splice(0, 1);

    meta.logger.debug("Screenshot transformation done", {
      screenshots: response.screenshots,
      screenshot: response.screenshot,
    });
  }

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", {
      response,
      sourceURL: meta.url,
    });
  }

  const result: EngineScrapeResult = {
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    screenshot: response.screenshot,
  };

  // Action output is only reported when at least one action was sent.
  if (hasActions) {
    result.actions = {
      screenshots: response.screenshots ?? [],
      scrapes: response.actionContent ?? [],
    };
  }

  return result;
}
|
|
|
|
|
2024-12-11 19:46:11 -03:00
|
|
|
/**
 * Scrapes `meta.url` through fire-engine's `playwright` engine.
 *
 * @param meta - Scrape context supplying the URL, options, internal options
 *   and logger.
 * @param timeToRun - Time budget in milliseconds; 300000 is used when
 *   undefined. `meta.options.waitFor` is added on top.
 * @returns The scrape result built from the fire-engine response; the
 *   `screenshot` field is only present when fire-engine returned at least
 *   one screenshot.
 */
export async function scrapeURLWithFireEnginePlaywright(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  const totalWait = meta.options.waitFor;
  const timeout = (timeToRun ?? 300000) + totalWait;

  const request: FireEngineScrapeRequestCommon &
    FireEngineScrapeRequestPlaywright = {
    url: meta.url,
    engine: "playwright",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,
    screenshot: meta.options.formats.includes("screenshot"),
    fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
    wait: meta.options.waitFor,
    geolocation: meta.options.geolocation,

    timeout,
  };

  const response = await performFireEngineScrape(
    meta.logger.child({
      // Label now names this engine; it previously said "ChromeCDP", which
      // misattributed these log entries.
      method: "scrapeURLWithFireEnginePlaywright/callFireEngine",
      request,
    }),
    request,
    timeout,
  );

  specialtyScrapeCheck(
    meta.logger.child({
      method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck",
    }),
    response.responseHeaders,
  );

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", {
      response,
      sourceURL: meta.url,
    });
  }

  return {
    // Fall back to the requested URL when fire-engine did not report one.
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,

    ...(response.screenshots !== undefined && response.screenshots.length > 0
      ? {
          screenshot: response.screenshots[0],
        }
      : {}),
  };
}
|
|
|
|
|
2024-12-11 19:46:11 -03:00
|
|
|
/**
 * Scrapes `meta.url` through fire-engine's `tlsclient` engine.
 *
 * @param meta - Scrape context supplying the URL, options, internal options
 *   and logger.
 * @param timeToRun - Time budget in milliseconds; 30000 is used when
 *   undefined.
 * @returns The scrape result built from the fire-engine response.
 */
export async function scrapeURLWithFireEngineTLSClient(
  meta: Meta,
  timeToRun: number | undefined,
): Promise<EngineScrapeResult> {
  // NOTE(review): the default here is 30000 ms while the chrome-cdp and
  // playwright functions default to 300000 ms -- confirm this is intentional.
  const timeout = timeToRun ?? 30000;

  const request: FireEngineScrapeRequestCommon &
    FireEngineScrapeRequestTLSClient = {
    url: meta.url,
    engine: "tlsclient",
    instantReturn: true,

    headers: meta.options.headers,
    priority: meta.internalOptions.priority,

    atsv: meta.internalOptions.atsv,
    geolocation: meta.options.geolocation,
    disableJsDom: meta.internalOptions.v0DisableJsDom,

    timeout,
  };

  const response = await performFireEngineScrape(
    meta.logger.child({
      // Label now names this engine; it previously said "ChromeCDP", which
      // misattributed these log entries.
      method: "scrapeURLWithFireEngineTLSClient/callFireEngine",
      request,
    }),
    request,
    timeout,
  );

  specialtyScrapeCheck(
    meta.logger.child({
      method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck",
    }),
    response.responseHeaders,
  );

  if (!response.url) {
    meta.logger.warn("Fire-engine did not return the response's URL", {
      response,
      sourceURL: meta.url,
    });
  }

  return {
    // Fall back to the requested URL when fire-engine did not report one.
    url: response.url ?? meta.url,

    html: response.content,
    error: response.pageError,
    statusCode: response.pageStatusCode,
  };
}
|