mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-12-27 15:13:26 +00:00
Merge branch 'main' into nsc/extract-url-trace
This commit is contained in:
commit
ece95e97f4
@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node";
|
||||
import { z } from "zod";
|
||||
|
||||
import { robustFetch } from "../../lib/fetch";
|
||||
import { ActionError, EngineError, SiteError } from "../../error";
|
||||
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
|
||||
|
||||
const successSchema = z.object({
|
||||
jobId: z.string(),
|
||||
@ -35,6 +35,12 @@ const successSchema = z.object({
|
||||
})
|
||||
.array()
|
||||
.optional(),
|
||||
|
||||
// chrome-cdp only -- file download handler
|
||||
file: z.object({
|
||||
name: z.string(),
|
||||
content: z.string(),
|
||||
}).optional().or(z.null()),
|
||||
});
|
||||
|
||||
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
||||
@ -111,6 +117,11 @@ export async function fireEngineCheckStatus(
|
||||
status.error.includes("Chrome error: ")
|
||||
) {
|
||||
throw new SiteError(status.error.split("Chrome error: ")[1]);
|
||||
} else if (
|
||||
typeof status.error === "string" &&
|
||||
status.error.includes("File size exceeds")
|
||||
) {
|
||||
throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
|
||||
} else if (
|
||||
typeof status.error === "string" &&
|
||||
// TODO: improve this later
|
||||
|
||||
@ -13,7 +13,7 @@ import {
|
||||
FireEngineCheckStatusSuccess,
|
||||
StillProcessingError,
|
||||
} from "./checkStatus";
|
||||
import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
|
||||
import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { Action } from "../../../../lib/entities";
|
||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||
@ -71,7 +71,8 @@ async function performFireEngineScrape<
|
||||
} else if (
|
||||
error instanceof EngineError ||
|
||||
error instanceof SiteError ||
|
||||
error instanceof ActionError
|
||||
error instanceof ActionError ||
|
||||
error instanceof UnsupportedFileError
|
||||
) {
|
||||
logger.debug("Fire-engine scrape job failed.", {
|
||||
error,
|
||||
@ -91,6 +92,19 @@ async function performFireEngineScrape<
|
||||
await new Promise((resolve) => setTimeout(resolve, 250));
|
||||
}
|
||||
|
||||
specialtyScrapeCheck(
|
||||
logger.child({
|
||||
method: "performFireEngineScrape/specialtyScrapeCheck",
|
||||
}),
|
||||
status.responseHeaders,
|
||||
);
|
||||
|
||||
if (status.file) {
|
||||
const content = status.file.content;
|
||||
delete status.file;
|
||||
status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -160,13 +174,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
||||
timeout,
|
||||
);
|
||||
|
||||
specialtyScrapeCheck(
|
||||
meta.logger.child({
|
||||
method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck",
|
||||
}),
|
||||
response.responseHeaders,
|
||||
);
|
||||
|
||||
if (
|
||||
meta.options.formats.includes("screenshot") ||
|
||||
meta.options.formats.includes("screenshot@fullPage")
|
||||
@ -241,13 +248,6 @@ export async function scrapeURLWithFireEnginePlaywright(
|
||||
timeout,
|
||||
);
|
||||
|
||||
specialtyScrapeCheck(
|
||||
meta.logger.child({
|
||||
method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck",
|
||||
}),
|
||||
response.responseHeaders,
|
||||
);
|
||||
|
||||
if (!response.url) {
|
||||
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||
response,
|
||||
@ -301,13 +301,6 @@ export async function scrapeURLWithFireEngineTLSClient(
|
||||
timeout,
|
||||
);
|
||||
|
||||
specialtyScrapeCheck(
|
||||
meta.logger.child({
|
||||
method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck",
|
||||
}),
|
||||
response.responseHeaders,
|
||||
);
|
||||
|
||||
if (!response.url) {
|
||||
meta.logger.warn("Fire-engine did not return the response's URL", {
|
||||
response,
|
||||
|
||||
@ -32,6 +32,9 @@ async function scrapePDFWithLlamaParse(
|
||||
tempFilePath,
|
||||
) as unknown as ReadableStream<Uint8Array>;
|
||||
},
|
||||
bytes() {
|
||||
throw Error("Unimplemented in mock Blob: bytes");
|
||||
},
|
||||
arrayBuffer() {
|
||||
throw Error("Unimplemented in mock Blob: arrayBuffer");
|
||||
},
|
||||
|
||||
@ -64,3 +64,11 @@ export class ActionError extends Error {
|
||||
this.code = code;
|
||||
}
|
||||
}
|
||||
|
||||
export class UnsupportedFileError extends Error {
|
||||
public reason: string;
|
||||
constructor(reason: string) {
|
||||
super("Scrape resulted in unsupported file: " + reason);
|
||||
this.reason = reason;
|
||||
}
|
||||
}
|
||||
|
||||
@ -19,6 +19,7 @@ import {
|
||||
RemoveFeatureError,
|
||||
SiteError,
|
||||
TimeoutError,
|
||||
UnsupportedFileError,
|
||||
} from "./error";
|
||||
import { executeTransformers } from "./transformers";
|
||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||
@ -292,6 +293,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
throw error;
|
||||
} else if (error instanceof ActionError) {
|
||||
throw error;
|
||||
} else if (error instanceof UnsupportedFileError) {
|
||||
throw error;
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.info(
|
||||
@ -414,6 +417,8 @@ export async function scrapeURL(
|
||||
meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
|
||||
} else if (error instanceof ActionError) {
|
||||
meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
|
||||
} else if (error instanceof UnsupportedFileError) {
|
||||
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.error("scrapeURL: Unexpected error happened", { error });
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@mendable/firecrawl-js",
|
||||
"version": "1.10.0",
|
||||
"version": "1.10.1",
|
||||
"description": "JavaScript SDK for Firecrawl API",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
|
||||
@ -470,7 +470,7 @@ export default class FirecrawlApp {
|
||||
let statusData = response.data
|
||||
if ("data" in statusData) {
|
||||
let data = statusData.data;
|
||||
while ('next' in statusData) {
|
||||
while (typeof statusData === 'object' && 'next' in statusData) {
|
||||
statusData = (await this.getRequest(statusData.next, headers)).data;
|
||||
data = data.concat(statusData.data);
|
||||
}
|
||||
@ -704,7 +704,7 @@ export default class FirecrawlApp {
|
||||
let statusData = response.data
|
||||
if ("data" in statusData) {
|
||||
let data = statusData.data;
|
||||
while ('next' in statusData) {
|
||||
while (typeof statusData === 'object' && 'next' in statusData) {
|
||||
statusData = (await this.getRequest(statusData.next, headers)).data;
|
||||
data = data.concat(statusData.data);
|
||||
}
|
||||
@ -863,42 +863,46 @@ export default class FirecrawlApp {
|
||||
headers: AxiosRequestHeaders,
|
||||
checkInterval: number
|
||||
): Promise<CrawlStatusResponse | ErrorResponse> {
|
||||
while (true) {
|
||||
let statusResponse: AxiosResponse = await this.getRequest(
|
||||
`${this.apiUrl}/v1/crawl/${id}`,
|
||||
headers
|
||||
);
|
||||
if (statusResponse.status === 200) {
|
||||
let statusData = statusResponse.data;
|
||||
if (statusData.status === "completed") {
|
||||
if ("data" in statusData) {
|
||||
let data = statusData.data;
|
||||
while ('next' in statusData) {
|
||||
statusResponse = await this.getRequest(statusData.next, headers);
|
||||
statusData = statusResponse.data;
|
||||
data = data.concat(statusData.data);
|
||||
try {
|
||||
while (true) {
|
||||
let statusResponse: AxiosResponse = await this.getRequest(
|
||||
`${this.apiUrl}/v1/crawl/${id}`,
|
||||
headers
|
||||
);
|
||||
if (statusResponse.status === 200) {
|
||||
let statusData = statusResponse.data;
|
||||
if (statusData.status === "completed") {
|
||||
if ("data" in statusData) {
|
||||
let data = statusData.data;
|
||||
while (typeof statusData === 'object' && 'next' in statusData) {
|
||||
statusResponse = await this.getRequest(statusData.next, headers);
|
||||
statusData = statusResponse.data;
|
||||
data = data.concat(statusData.data);
|
||||
}
|
||||
statusData.data = data;
|
||||
return statusData;
|
||||
} else {
|
||||
throw new FirecrawlError("Crawl job completed but no data was returned", 500);
|
||||
}
|
||||
statusData.data = data;
|
||||
return statusData;
|
||||
} else {
|
||||
throw new FirecrawlError("Crawl job completed but no data was returned", 500);
|
||||
}
|
||||
} else if (
|
||||
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
|
||||
) {
|
||||
checkInterval = Math.max(checkInterval, 2);
|
||||
await new Promise((resolve) =>
|
||||
setTimeout(resolve, checkInterval * 1000)
|
||||
);
|
||||
} else if (
|
||||
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
|
||||
) {
|
||||
checkInterval = Math.max(checkInterval, 2);
|
||||
await new Promise((resolve) =>
|
||||
setTimeout(resolve, checkInterval * 1000)
|
||||
);
|
||||
} else {
|
||||
throw new FirecrawlError(
|
||||
`Crawl job failed or was stopped. Status: ${statusData.status}`,
|
||||
500
|
||||
);
|
||||
}
|
||||
} else {
|
||||
throw new FirecrawlError(
|
||||
`Crawl job failed or was stopped. Status: ${statusData.status}`,
|
||||
500
|
||||
);
|
||||
this.handleError(statusResponse, "check crawl status");
|
||||
}
|
||||
} else {
|
||||
this.handleError(statusResponse, "check crawl status");
|
||||
}
|
||||
} catch (error: any) {
|
||||
throw new FirecrawlError(error, 500);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user