feat(scrapeURL/fire-engine/chrome-cdp): handle file downloads

This commit is contained in:
Gergő Móricz 2024-12-26 20:29:09 +01:00
parent 37f258b73f
commit f15ef0e758
4 changed files with 41 additions and 24 deletions

View File

@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node";
import { z } from "zod";
import { robustFetch } from "../../lib/fetch";
import { ActionError, EngineError, SiteError } from "../../error";
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
const successSchema = z.object({
jobId: z.string(),
@ -35,6 +35,12 @@ const successSchema = z.object({
})
.array()
.optional(),
// chrome-cdp only -- file download handler
file: z.object({
name: z.string(),
content: z.string(),
}).optional().or(z.null()),
});
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
@ -111,6 +117,11 @@ export async function fireEngineCheckStatus(
status.error.includes("Chrome error: ")
) {
throw new SiteError(status.error.split("Chrome error: ")[1]);
} else if (
typeof status.error === "string" &&
status.error.includes("File size exceeds")
) {
throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
} else if (
typeof status.error === "string" &&
// TODO: improve this later

View File

@ -13,7 +13,7 @@ import {
FireEngineCheckStatusSuccess,
StillProcessingError,
} from "./checkStatus";
import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
@ -71,7 +71,8 @@ async function performFireEngineScrape<
} else if (
error instanceof EngineError ||
error instanceof SiteError ||
error instanceof ActionError
error instanceof ActionError ||
error instanceof UnsupportedFileError
) {
logger.debug("Fire-engine scrape job failed.", {
error,
@ -91,6 +92,19 @@ async function performFireEngineScrape<
await new Promise((resolve) => setTimeout(resolve, 250));
}
specialtyScrapeCheck(
logger.child({
method: "performFireEngineScrape/specialtyScrapeCheck",
}),
status.responseHeaders,
);
if (status.file) {
const content = status.file.content;
delete status.file;
status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
}
return status;
}
@ -160,13 +174,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
timeout,
);
specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck",
}),
response.responseHeaders,
);
if (
meta.options.formats.includes("screenshot") ||
meta.options.formats.includes("screenshot@fullPage")
@ -241,13 +248,6 @@ export async function scrapeURLWithFireEnginePlaywright(
timeout,
);
specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck",
}),
response.responseHeaders,
);
if (!response.url) {
meta.logger.warn("Fire-engine did not return the response's URL", {
response,
@ -301,13 +301,6 @@ export async function scrapeURLWithFireEngineTLSClient(
timeout,
);
specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck",
}),
response.responseHeaders,
);
if (!response.url) {
meta.logger.warn("Fire-engine did not return the response's URL", {
response,

View File

@ -64,3 +64,11 @@ export class ActionError extends Error {
this.code = code;
}
}
/**
 * Error signalling that a scrape resulted in a file that is not supported.
 *
 * The caller-supplied explanation is stored on `reason` and is also appended
 * to a fixed prefix to form the error's `message`.
 */
export class UnsupportedFileError extends Error {
  /**
   * @param reason Explanation of why the file is unsupported; exposed as the
   *   public `reason` property and embedded in the error message.
   */
  constructor(public reason: string) {
    super("Scrape resulted in unsupported file: " + reason);
  }
}

View File

@ -19,6 +19,7 @@ import {
RemoveFeatureError,
SiteError,
TimeoutError,
UnsupportedFileError,
} from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
@ -292,6 +293,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error;
} else if (error instanceof ActionError) {
throw error;
} else if (error instanceof UnsupportedFileError) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.info(
@ -414,6 +417,8 @@ export async function scrapeURL(
meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
} else if (error instanceof ActionError) {
meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
} else if (error instanceof UnsupportedFileError) {
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
} else {
Sentry.captureException(error);
meta.logger.error("scrapeURL: Unexpected error happened", { error });