mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-09-26 08:52:50 +00:00
feat(scraper): runpod v2 parallel testing (#1636)
* feat(scraper): runpod v2 parallel testing * fix catch
This commit is contained in:
parent
b2e0f657bd
commit
4bf64d2c01
@ -7,14 +7,22 @@ import * as Sentry from "@sentry/node";
|
|||||||
import escapeHtml from "escape-html";
|
import escapeHtml from "escape-html";
|
||||||
import PdfParse from "pdf-parse";
|
import PdfParse from "pdf-parse";
|
||||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||||
import { PDFAntibotError, PDFInsufficientTimeError, RemoveFeatureError, TimeoutError } from "../../error";
|
import {
|
||||||
|
PDFAntibotError,
|
||||||
|
PDFInsufficientTimeError,
|
||||||
|
RemoveFeatureError,
|
||||||
|
TimeoutError,
|
||||||
|
} from "../../error";
|
||||||
import { readFile, unlink } from "node:fs/promises";
|
import { readFile, unlink } from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
import type { Response } from "undici";
|
import type { Response } from "undici";
|
||||||
import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";
|
import {
|
||||||
|
getPdfResultFromCache,
|
||||||
|
savePdfResultToCache,
|
||||||
|
} from "../../../../lib/gcs-pdf-cache";
|
||||||
import { getPageCount } from "../../../../lib/pdf-parser";
|
import { getPageCount } from "../../../../lib/pdf-parser";
|
||||||
|
|
||||||
type PDFProcessorResult = { html: string; markdown?: string; };
|
type PDFProcessorResult = { html: string; markdown?: string };
|
||||||
|
|
||||||
const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
|
const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
|
||||||
const MILLISECONDS_PER_PAGE = 150;
|
const MILLISECONDS_PER_PAGE = 150;
|
||||||
@ -33,7 +41,7 @@ async function scrapePDFWithRunPodMU(
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
const cachedResult = await getPdfResultFromCache(base64Content);
|
const cachedResult = await getPdfResultFromCache(base64Content);
|
||||||
|
|
||||||
if (cachedResult) {
|
if (cachedResult) {
|
||||||
meta.logger.info("Using cached RunPod MU result for PDF", {
|
meta.logger.info("Using cached RunPod MU result for PDF", {
|
||||||
tempFilePath,
|
tempFilePath,
|
||||||
@ -47,7 +55,9 @@ async function scrapePDFWithRunPodMU(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const timeout = timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined;
|
const timeout = timeToRun
|
||||||
|
? timeToRun - (Date.now() - preCacheCheckStartTime)
|
||||||
|
: undefined;
|
||||||
if (timeout && timeout < 0) {
|
if (timeout && timeout < 0) {
|
||||||
throw new TimeoutError("MU PDF parser already timed out before call");
|
throw new TimeoutError("MU PDF parser already timed out before call");
|
||||||
}
|
}
|
||||||
@ -75,21 +85,60 @@ async function scrapePDFWithRunPodMU(
|
|||||||
schema: z.object({
|
schema: z.object({
|
||||||
id: z.string(),
|
id: z.string(),
|
||||||
status: z.string(),
|
status: z.string(),
|
||||||
output: z.object({
|
output: z
|
||||||
markdown: z.string(),
|
.object({
|
||||||
}).optional(),
|
markdown: z.string(),
|
||||||
|
})
|
||||||
|
.optional(),
|
||||||
}),
|
}),
|
||||||
mock: meta.mock,
|
mock: meta.mock,
|
||||||
abort,
|
abort,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
//this is just so we can test in parallel and compare results
|
||||||
|
robustFetch({
|
||||||
|
url:
|
||||||
|
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MUV2_POD_ID + "/runsync",
|
||||||
|
method: "POST",
|
||||||
|
headers: {
|
||||||
|
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
|
||||||
|
},
|
||||||
|
body: {
|
||||||
|
input: {
|
||||||
|
file_content: base64Content,
|
||||||
|
filename: path.basename(tempFilePath) + ".pdf",
|
||||||
|
timeout,
|
||||||
|
created_at: Date.now(),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
logger: meta.logger.child({
|
||||||
|
method: "scrapePDFWithRunPodMU/runsync/robustFetch",
|
||||||
|
}),
|
||||||
|
schema: z.object({
|
||||||
|
id: z.string(),
|
||||||
|
status: z.string(),
|
||||||
|
output: z
|
||||||
|
.object({
|
||||||
|
markdown: z.string(),
|
||||||
|
})
|
||||||
|
.optional(),
|
||||||
|
}),
|
||||||
|
mock: meta.mock,
|
||||||
|
abort,
|
||||||
|
}).catch(error => {
|
||||||
|
meta.logger.warn("Error scraping PDF with RunPod MU V2", {
|
||||||
|
error,
|
||||||
|
tempFilePath,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
let status: string = podStart.status;
|
let status: string = podStart.status;
|
||||||
let result: { markdown: string } | undefined = podStart.output;
|
let result: { markdown: string } | undefined = podStart.output;
|
||||||
|
|
||||||
if (status === "IN_QUEUE" || status === "IN_PROGRESS") {
|
if (status === "IN_QUEUE" || status === "IN_PROGRESS") {
|
||||||
do {
|
do {
|
||||||
abort?.throwIfAborted();
|
abort?.throwIfAborted();
|
||||||
await new Promise(resolve => setTimeout(resolve, 2500));
|
await new Promise((resolve) => setTimeout(resolve, 2500));
|
||||||
abort?.throwIfAborted();
|
abort?.throwIfAborted();
|
||||||
const podStatus = await robustFetch({
|
const podStatus = await robustFetch({
|
||||||
url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${podStart.id}`,
|
url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${podStart.id}`,
|
||||||
@ -102,9 +151,11 @@ async function scrapePDFWithRunPodMU(
|
|||||||
}),
|
}),
|
||||||
schema: z.object({
|
schema: z.object({
|
||||||
status: z.string(),
|
status: z.string(),
|
||||||
output: z.object({
|
output: z
|
||||||
markdown: z.string(),
|
.object({
|
||||||
}).optional(),
|
markdown: z.string(),
|
||||||
|
})
|
||||||
|
.optional(),
|
||||||
}),
|
}),
|
||||||
mock: meta.mock,
|
mock: meta.mock,
|
||||||
abort,
|
abort,
|
||||||
@ -159,14 +210,16 @@ export async function scrapePDF(
|
|||||||
timeToRun: number | undefined,
|
timeToRun: number | undefined,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
|
|
||||||
if (!meta.options.parsePDF) {
|
if (!meta.options.parsePDF) {
|
||||||
if (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null) {
|
if (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null) {
|
||||||
const content = (await readFile(meta.pdfPrefetch.filePath)).toString("base64");
|
const content = (await readFile(meta.pdfPrefetch.filePath)).toString(
|
||||||
|
"base64",
|
||||||
|
);
|
||||||
return {
|
return {
|
||||||
url: meta.pdfPrefetch.url ?? meta.url,
|
url: meta.pdfPrefetch.url ?? meta.url,
|
||||||
statusCode: meta.pdfPrefetch.status,
|
statusCode: meta.pdfPrefetch.status,
|
||||||
|
|
||||||
html: content,
|
html: content,
|
||||||
markdown: content,
|
markdown: content,
|
||||||
};
|
};
|
||||||
@ -174,40 +227,47 @@ export async function scrapePDF(
|
|||||||
const file = await fetchFileToBuffer(meta.url, {
|
const file = await fetchFileToBuffer(meta.url, {
|
||||||
headers: meta.options.headers,
|
headers: meta.options.headers,
|
||||||
});
|
});
|
||||||
|
|
||||||
const ct = file.response.headers.get("Content-Type");
|
const ct = file.response.headers.get("Content-Type");
|
||||||
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
|
if (ct && !ct.includes("application/pdf")) {
|
||||||
|
// if downloaded file wasn't a PDF
|
||||||
throw new PDFAntibotError();
|
throw new PDFAntibotError();
|
||||||
}
|
}
|
||||||
|
|
||||||
const content = file.buffer.toString("base64");
|
const content = file.buffer.toString("base64");
|
||||||
return {
|
return {
|
||||||
url: file.response.url,
|
url: file.response.url,
|
||||||
statusCode: file.response.status,
|
statusCode: file.response.status,
|
||||||
|
|
||||||
html: content,
|
html: content,
|
||||||
markdown: content,
|
markdown: content,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
|
const { response, tempFilePath } =
|
||||||
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
|
meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null
|
||||||
: await downloadFile(meta.id, meta.url, {
|
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
|
||||||
headers: meta.options.headers,
|
: await downloadFile(meta.id, meta.url, {
|
||||||
});
|
headers: meta.options.headers,
|
||||||
|
});
|
||||||
if ((response as any).headers) { // if downloadFile was used
|
|
||||||
|
if ((response as any).headers) {
|
||||||
|
// if downloadFile was used
|
||||||
const r: Response = response as any;
|
const r: Response = response as any;
|
||||||
const ct = r.headers.get("Content-Type");
|
const ct = r.headers.get("Content-Type");
|
||||||
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
|
if (ct && !ct.includes("application/pdf")) {
|
||||||
|
// if downloaded file wasn't a PDF
|
||||||
throw new PDFAntibotError();
|
throw new PDFAntibotError();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const pageCount = await getPageCount(tempFilePath);
|
const pageCount = await getPageCount(tempFilePath);
|
||||||
if (pageCount * MILLISECONDS_PER_PAGE > (timeToRun ?? Infinity)) {
|
if (pageCount * MILLISECONDS_PER_PAGE > (timeToRun ?? Infinity)) {
|
||||||
throw new PDFInsufficientTimeError(pageCount, pageCount * MILLISECONDS_PER_PAGE + 5000);
|
throw new PDFInsufficientTimeError(
|
||||||
|
pageCount,
|
||||||
|
pageCount * MILLISECONDS_PER_PAGE + 5000,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let result: PDFProcessorResult | null = null;
|
let result: PDFProcessorResult | null = null;
|
||||||
@ -229,20 +289,26 @@ export async function scrapePDF(
|
|||||||
}),
|
}),
|
||||||
},
|
},
|
||||||
tempFilePath,
|
tempFilePath,
|
||||||
timeToRun ? (timeToRun - (Date.now() - startTime)) : undefined,
|
timeToRun ? timeToRun - (Date.now() - startTime) : undefined,
|
||||||
base64Content,
|
base64Content,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (
|
if (
|
||||||
error instanceof RemoveFeatureError
|
error instanceof RemoveFeatureError ||
|
||||||
|| error instanceof TimeoutError
|
error instanceof TimeoutError
|
||||||
) {
|
) {
|
||||||
throw error;
|
throw error;
|
||||||
} else if (
|
} else if (
|
||||||
(error instanceof Error && error.name === "TimeoutError")
|
(error instanceof Error && error.name === "TimeoutError") ||
|
||||||
|| (error instanceof Error && error.message === "Request failed" && error.cause && error.cause instanceof Error && error.cause.name === "TimeoutError")
|
(error instanceof Error &&
|
||||||
|
error.message === "Request failed" &&
|
||||||
|
error.cause &&
|
||||||
|
error.cause instanceof Error &&
|
||||||
|
error.cause.name === "TimeoutError")
|
||||||
) {
|
) {
|
||||||
throw new TimeoutError("PDF parsing timed out, please increase the timeout parameter in your scrape request");
|
throw new TimeoutError(
|
||||||
|
"PDF parsing timed out, please increase the timeout parameter in your scrape request",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
meta.logger.warn(
|
meta.logger.warn(
|
||||||
"RunPod MU failed to parse PDF (could be due to timeout) -- falling back to parse-pdf",
|
"RunPod MU failed to parse PDF (could be due to timeout) -- falling back to parse-pdf",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user