Mirror of https://github.com/mendableai/firecrawl.git (synced 2025-09-26 17:01:27 +00:00)
feat(scrapeURL/fire-engine/chrome-cdp): handle file downloads

commit f15ef0e758
parent 37f258b73f
@@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node";
 import { z } from "zod";
 
 import { robustFetch } from "../../lib/fetch";
-import { ActionError, EngineError, SiteError } from "../../error";
+import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
 
 const successSchema = z.object({
   jobId: z.string(),
@@ -35,6 +35,12 @@ const successSchema = z.object({
     })
     .array()
     .optional(),
+
+  // chrome-cdp only -- file download handler
+  file: z.object({
+    name: z.string(),
+    content: z.string(),
+  }).optional().or(z.null()),
 });
 
 export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
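For illustration, a minimal standalone sketch of how the extended schema accepts a chrome-cdp response that carried a downloaded file. The trimmed-down schema and the sample payload are hypothetical; only the shape of the new file field comes from the hunk above.

import { z } from "zod";

// Trimmed-down sketch of successSchema, keeping only the fields needed here.
const sketchSchema = z.object({
  jobId: z.string(),
  content: z.string(),
  // chrome-cdp only -- file download handler
  file: z.object({
    name: z.string(),
    content: z.string(), // base64-encoded body of the downloaded file
  }).optional().or(z.null()),
});

// Hypothetical payload for a URL that triggered a download.
const parsed = sketchSchema.parse({
  jobId: "job-123",
  content: "",
  file: { name: "invoice.pdf", content: "JVBERi0xLjQ=" },
});

console.log(parsed.file?.name); // "invoice.pdf"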
@@ -111,6 +117,11 @@ export async function fireEngineCheckStatus(
       status.error.includes("Chrome error: ")
     ) {
       throw new SiteError(status.error.split("Chrome error: ")[1]);
+    } else if (
+      typeof status.error === "string" &&
+      status.error.includes("File size exceeds")
+    ) {
+      throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
     } else if (
       typeof status.error === "string" &&
       // TODO: improve this later
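A quick sketch of what the new branch does with a fire-engine error string. The sample error text is made up; only the "File size exceeds" marker and the split logic come from the hunk above.

// Hypothetical error string reported by fire-engine for an oversized download.
const statusError = "Download failed. File size exceeds the maximum of 10485760 bytes";

if (statusError.includes("File size exceeds")) {
  // Re-assemble the human-readable reason, as the branch above does.
  const reason = "File size exceeds " + statusError.split("File size exceeds ")[1];
  console.log(reason); // "File size exceeds the maximum of 10485760 bytes"
}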
@@ -13,7 +13,7 @@ import {
   FireEngineCheckStatusSuccess,
   StillProcessingError,
 } from "./checkStatus";
-import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
+import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
 import * as Sentry from "@sentry/node";
 import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
@@ -71,7 +71,8 @@ async function performFireEngineScrape<
     } else if (
       error instanceof EngineError ||
       error instanceof SiteError ||
-      error instanceof ActionError
+      error instanceof ActionError ||
+      error instanceof UnsupportedFileError
     ) {
       logger.debug("Fire-engine scrape job failed.", {
         error,
@@ -91,6 +92,19 @@ async function performFireEngineScrape<
     await new Promise((resolve) => setTimeout(resolve, 250));
   }
 
+  specialtyScrapeCheck(
+    logger.child({
+      method: "performFireEngineScrape/specialtyScrapeCheck",
+    }),
+    status.responseHeaders,
+  );
+
+  if (status.file) {
+    const content = status.file.content;
+    delete status.file;
+    status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
+  }
+
   return status;
 }
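The download path above replaces status.content with the base64-decoded file body. A minimal sketch under the assumption of a UTF-8 text file; the Status type and the payload are illustrative, not the real checkStatus success object.

// Illustrative shape; the real object is the fire-engine check-status success payload.
type Status = { content: string; file?: { name: string; content: string } };

const status: Status = {
  content: "",
  file: { name: "notes.txt", content: Buffer.from("hello world").toString("base64") },
};

if (status.file) {
  const content = status.file.content;
  delete status.file;
  // Decode as UTF-8; other encodings are still a TODO per the comment in the hunk above.
  status.content = Buffer.from(content, "base64").toString("utf8");
}

console.log(status.content); // "hello world"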
@@ -160,13 +174,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
     timeout,
   );
 
-  specialtyScrapeCheck(
-    meta.logger.child({
-      method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck",
-    }),
-    response.responseHeaders,
-  );
-
   if (
     meta.options.formats.includes("screenshot") ||
     meta.options.formats.includes("screenshot@fullPage")
@@ -241,13 +248,6 @@ export async function scrapeURLWithFireEnginePlaywright(
     timeout,
   );
 
-  specialtyScrapeCheck(
-    meta.logger.child({
-      method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck",
-    }),
-    response.responseHeaders,
-  );
-
   if (!response.url) {
     meta.logger.warn("Fire-engine did not return the response's URL", {
       response,
@@ -301,13 +301,6 @@ export async function scrapeURLWithFireEngineTLSClient(
     timeout,
   );
 
-  specialtyScrapeCheck(
-    meta.logger.child({
-      method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck",
-    }),
-    response.responseHeaders,
-  );
-
   if (!response.url) {
     meta.logger.warn("Fire-engine did not return the response's URL", {
       response,
@@ -64,3 +64,11 @@ export class ActionError extends Error {
     this.code = code;
   }
 }
+
+export class UnsupportedFileError extends Error {
+  public reason: string;
+  constructor(reason: string) {
+    super("Scrape resulted in unsupported file: " + reason);
+    this.reason = reason;
+  }
+}
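Usage sketch for the new error class. The class body is copied from the hunk above; the try/catch wrapper and the reason string are illustrative.

class UnsupportedFileError extends Error {
  public reason: string;
  constructor(reason: string) {
    super("Scrape resulted in unsupported file: " + reason);
    this.reason = reason;
  }
}

try {
  // Hypothetical reason string, as would be produced by fireEngineCheckStatus above.
  throw new UnsupportedFileError("File size exceeds the maximum of 10485760 bytes");
} catch (error) {
  if (error instanceof UnsupportedFileError) {
    console.warn(error.message); // "Scrape resulted in unsupported file: File size exceeds ..."
    console.warn(error.reason);  // "File size exceeds the maximum of 10485760 bytes"
  }
}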
@@ -19,6 +19,7 @@ import {
   RemoveFeatureError,
   SiteError,
   TimeoutError,
+  UnsupportedFileError,
 } from "./error";
 import { executeTransformers } from "./transformers";
 import { LLMRefusalError } from "./transformers/llmExtract";
@@ -292,6 +293,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
         throw error;
       } else if (error instanceof ActionError) {
         throw error;
+      } else if (error instanceof UnsupportedFileError) {
+        throw error;
       } else {
         Sentry.captureException(error);
         meta.logger.info(
@@ -414,6 +417,8 @@ export async function scrapeURL(
       meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
     } else if (error instanceof ActionError) {
       meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
+    } else if (error instanceof UnsupportedFileError) {
+      meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
     } else {
       Sentry.captureException(error);
       meta.logger.error("scrapeURL: Unexpected error happened", { error });