From f15ef0e7582e39e6ed17ee9f01e10969f02b4c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 26 Dec 2024 20:29:09 +0100 Subject: [PATCH] feat(scrapeURL/fire-engine/chrome-cdp): handle file downloads --- .../engines/fire-engine/checkStatus.ts | 13 ++++++- .../scrapeURL/engines/fire-engine/index.ts | 39 ++++++++----------- apps/api/src/scraper/scrapeURL/error.ts | 8 ++++ apps/api/src/scraper/scrapeURL/index.ts | 5 +++ 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 6f65db98..e02e9dbb 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node"; import { z } from "zod"; import { robustFetch } from "../../lib/fetch"; -import { ActionError, EngineError, SiteError } from "../../error"; +import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error"; const successSchema = z.object({ jobId: z.string(), @@ -35,6 +35,12 @@ const successSchema = z.object({ }) .array() .optional(), + + // chrome-cdp only -- file download handler + file: z.object({ + name: z.string(), + content: z.string(), + }).optional().or(z.null()), }); export type FireEngineCheckStatusSuccess = z.infer; @@ -111,6 +117,11 @@ export async function fireEngineCheckStatus( status.error.includes("Chrome error: ") ) { throw new SiteError(status.error.split("Chrome error: ")[1]); + } else if ( + typeof status.error === "string" && + status.error.includes("File size exceeds") + ) { + throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]); } else if ( typeof status.error === "string" && // TODO: improve this later diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index d753465d..aa869836 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -13,7 +13,7 @@ import { FireEngineCheckStatusSuccess, StillProcessingError, } from "./checkStatus"; -import { ActionError, EngineError, SiteError, TimeoutError } from "../../error"; +import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error"; import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; @@ -71,7 +71,8 @@ async function performFireEngineScrape< } else if ( error instanceof EngineError || error instanceof SiteError || - error instanceof ActionError + error instanceof ActionError || + error instanceof UnsupportedFileError ) { logger.debug("Fire-engine scrape job failed.", { error, @@ -91,6 +92,19 @@ async function performFireEngineScrape< await new Promise((resolve) => setTimeout(resolve, 250)); } + specialtyScrapeCheck( + logger.child({ + method: "performFireEngineScrape/specialtyScrapeCheck", + }), + status.responseHeaders, + ); + + if (status.file) { + const content = status.file.content; + delete status.file; + status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag + } + return status; } @@ -160,13 +174,6 @@ export async function scrapeURLWithFireEngineChromeCDP( timeout, ); - specialtyScrapeCheck( - meta.logger.child({ - method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck", - }), - response.responseHeaders, - ); - if ( meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage") @@ -241,13 +248,6 @@ export async function scrapeURLWithFireEnginePlaywright( timeout, ); - specialtyScrapeCheck( - meta.logger.child({ - method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck", - }), - response.responseHeaders, - ); - if (!response.url) { meta.logger.warn("Fire-engine did not return the response's URL", { response, @@ -301,13 +301,6 @@ export async function scrapeURLWithFireEngineTLSClient( timeout, ); - specialtyScrapeCheck( - meta.logger.child({ - method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck", - }), - response.responseHeaders, - ); - if (!response.url) { meta.logger.warn("Fire-engine did not return the response's URL", { response, diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index 689f90c8..bff3a492 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -64,3 +64,11 @@ export class ActionError extends Error { this.code = code; } } + +export class UnsupportedFileError extends Error { + public reason: string; + constructor(reason: string) { + super("Scrape resulted in unsupported file: " + reason); + this.reason = reason; + } +} diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 1df812bd..130ef9ee 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -19,6 +19,7 @@ import { RemoveFeatureError, SiteError, TimeoutError, + UnsupportedFileError, } from "./error"; import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; @@ -292,6 +293,8 @@ async function scrapeURLLoop(meta: Meta): Promise { throw error; } else if (error instanceof ActionError) { throw error; + } else if (error instanceof UnsupportedFileError) { + throw error; } else { Sentry.captureException(error); meta.logger.info( @@ -414,6 +417,8 @@ export async function scrapeURL( meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); } else if (error instanceof ActionError) { meta.logger.warn("scrapeURL: Action(s) failed to complete", { error }); + } else if (error instanceof UnsupportedFileError) { + meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error }); } else { Sentry.captureException(error); meta.logger.error("scrapeURL: Unexpected error happened", { error });