mirror of
https://github.com/mendableai/firecrawl.git
synced 2025-06-27 00:41:33 +00:00
hotfix: kill zombie workers, respect timeouts better (FIR-2034) (#1575)
* feat(scrapeURL): add strict timeouts everywhere
* feat(queue-worker/liveness): add networking check
* fix(queue-worker): typo
* fix(queue-worker/liveness): do not parse
* fix(queue-worker): check local network instead
* fix(queue-worker/liveness): typo
This commit is contained in:
parent
5152019a05
commit
f838190ba6
@ -3,9 +3,10 @@ import { EngineScrapeResult } from "..";
|
||||
import { downloadFile } from "../utils/downloadFile";
|
||||
import mammoth from "mammoth";
|
||||
|
||||
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
||||
export async function scrapeDOCX(meta: Meta, timeToRun: number | undefined): Promise<EngineScrapeResult> {
|
||||
const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
|
||||
headers: meta.options.headers,
|
||||
signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeToRun ?? 300000),
|
||||
});
|
||||
|
||||
return {
|
||||
|
@ -59,7 +59,7 @@ export async function scrapeURLWithFetch(
|
||||
dispatcher: await makeSecureDispatcher(meta.url),
|
||||
redirect: "follow",
|
||||
headers: meta.options.headers,
|
||||
signal: meta.internalOptions.abort,
|
||||
signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeout),
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) =>
|
||||
|
@ -142,6 +142,7 @@ export async function fireEngineCheckStatus(
|
||||
: {}),
|
||||
},
|
||||
mock,
|
||||
abort,
|
||||
});
|
||||
},
|
||||
);
|
||||
|
@ -9,6 +9,7 @@ export async function fireEngineDelete(
|
||||
logger: Logger,
|
||||
jobId: string,
|
||||
mock: MockState | null,
|
||||
abort?: AbortSignal,
|
||||
) {
|
||||
await Sentry.startSpan(
|
||||
{
|
||||
@ -33,6 +34,7 @@ export async function fireEngineDelete(
|
||||
ignoreFailure: true,
|
||||
logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }),
|
||||
mock,
|
||||
abort,
|
||||
});
|
||||
},
|
||||
);
|
||||
|
@ -48,6 +48,7 @@ async function performFireEngineScrape<
|
||||
logger.child({ method: "fireEngineScrape" }),
|
||||
request,
|
||||
mock,
|
||||
abort,
|
||||
);
|
||||
|
||||
const startTime = Date.now();
|
||||
@ -56,6 +57,7 @@ async function performFireEngineScrape<
|
||||
let status: FireEngineCheckStatusSuccess | undefined = undefined;
|
||||
|
||||
while (status === undefined) {
|
||||
abort?.throwIfAborted();
|
||||
if (errors.length >= errorLimit) {
|
||||
logger.error("Error limit hit.", { errors });
|
||||
fireEngineDelete(
|
||||
@ -236,7 +238,7 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
||||
request,
|
||||
timeout,
|
||||
meta.mock,
|
||||
meta.internalOptions.abort,
|
||||
meta.internalOptions.abort ?? AbortSignal.timeout(timeout),
|
||||
);
|
||||
|
||||
if (
|
||||
@ -317,7 +319,7 @@ export async function scrapeURLWithFireEnginePlaywright(
|
||||
request,
|
||||
timeout,
|
||||
meta.mock,
|
||||
meta.internalOptions.abort,
|
||||
meta.internalOptions.abort ?? AbortSignal.timeout(timeout),
|
||||
);
|
||||
|
||||
if (!response.url) {
|
||||
@ -373,7 +375,7 @@ export async function scrapeURLWithFireEngineTLSClient(
|
||||
request,
|
||||
timeout,
|
||||
meta.mock,
|
||||
meta.internalOptions.abort,
|
||||
meta.internalOptions.abort ?? AbortSignal.timeout(timeout),
|
||||
);
|
||||
|
||||
if (!response.url) {
|
||||
|
@ -45,6 +45,8 @@ async function scrapePDFWithRunPodMU(
|
||||
});
|
||||
}
|
||||
|
||||
const timeout = timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined;
|
||||
|
||||
const result = await robustFetch({
|
||||
url:
|
||||
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
|
||||
@ -56,7 +58,7 @@ async function scrapePDFWithRunPodMU(
|
||||
input: {
|
||||
file_content: base64Content,
|
||||
filename: path.basename(tempFilePath) + ".pdf",
|
||||
timeout: timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined,
|
||||
timeout,
|
||||
created_at: Date.now(),
|
||||
},
|
||||
},
|
||||
@ -69,6 +71,7 @@ async function scrapePDFWithRunPodMU(
|
||||
}),
|
||||
}),
|
||||
mock: meta.mock,
|
||||
abort: timeout ? AbortSignal.timeout(timeout) : undefined,
|
||||
});
|
||||
|
||||
const processorResult = {
|
||||
|
@ -30,6 +30,7 @@ export async function scrapeURLWithPlaywright(
|
||||
pageError: z.string().optional(),
|
||||
}),
|
||||
mock: meta.mock,
|
||||
abort: AbortSignal.timeout(timeout),
|
||||
}),
|
||||
(async () => {
|
||||
await new Promise((resolve) => setTimeout(() => resolve(null), timeout));
|
||||
|
@ -85,6 +85,7 @@ import Express from "express";
|
||||
import http from "http";
|
||||
import https from "https";
|
||||
import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup";
|
||||
import { robustFetch } from "../scraper/scrapeURL/lib/fetch";
|
||||
|
||||
configDotenv();
|
||||
|
||||
@ -1546,10 +1547,25 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
const app = Express();
|
||||
|
||||
app.get("/liveness", (req, res) => {
|
||||
// stalled check
|
||||
if (isWorkerStalled) {
|
||||
res.status(500).json({ ok: false });
|
||||
} else {
|
||||
res.status(200).json({ ok: true });
|
||||
// networking check
|
||||
robustFetch({
|
||||
url: "http://firecrawl-app-service:3002",
|
||||
method: "GET",
|
||||
mock: null,
|
||||
logger: _logger,
|
||||
abort: AbortSignal.timeout(5000),
|
||||
ignoreResponse: true,
|
||||
})
|
||||
.then(() => {
|
||||
res.status(200).json({ ok: true });
|
||||
}).catch(e => {
|
||||
_logger.error("WORKER NETWORKING CHECK FAILED", { error: e });
|
||||
res.status(500).json({ ok: false });
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user