2024-08-17 01:04:14 +02:00
|
|
|
import { authMiddleware } from "../../routes/v1";
|
|
|
|
import { RateLimiterMode } from "../../types";
|
2024-08-26 18:48:00 -03:00
|
|
|
import { authenticateUser } from "../auth";
|
2024-08-17 01:04:14 +02:00
|
|
|
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
|
|
|
import { WebSocket } from "ws";
|
|
|
|
import { v4 as uuidv4 } from "uuid";
|
|
|
|
import { Logger } from "../../lib/logger";
|
|
|
|
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
2024-08-23 18:27:00 +02:00
|
|
|
import { getScrapeQueue } from "../../services/queue-service";
|
2024-08-17 01:04:14 +02:00
|
|
|
import { getJob, getJobs } from "./crawl-status";
|
2024-08-23 19:55:41 +02:00
|
|
|
import * as Sentry from "@sentry/node";
|
2024-08-17 01:04:14 +02:00
|
|
|
|
|
|
|
/** Failure notice; also used as the payload of an error close frame. */
type ErrorMessage = {
  type: "error";
  error: string;
};

/** Snapshot of the crawl's state at the moment the client connected. */
type CatchupMessage = {
  type: "catchup";
  data: CrawlStatusResponse;
};

/** A single scraped document, streamed after the catch-up snapshot. */
type DocumentMessage = {
  type: "document";
  data: Document;
};

/** Marks the end of the stream. */
type DoneMessage = {
  type: "done";
};

/** Every message this endpoint can emit, discriminated by `type`. */
type Message =
  | ErrorMessage
  | CatchupMessage
  | DoneMessage
  | DocumentMessage;
|
|
|
|
|
|
|
|
/**
 * Serializes `msg` to JSON and writes it to the socket.
 *
 * @param ws - Target socket.
 * @param msg - Message to deliver.
 * @returns A promise that resolves once the socket's send callback reports
 *   success and rejects with the error it reports otherwise. When the socket
 *   is not in the OPEN state nothing is written and the promise resolves
 *   immediately, so callers can always await the result.
 */
function send(ws: WebSocket, msg: Message): Promise<void> {
  if (ws.readyState !== ws.OPEN) {
    return Promise.resolve();
  }

  return new Promise<void>((resolve, reject) => {
    ws.send(JSON.stringify(msg), (err) => {
      if (err) reject(err);
      else resolve();
    });
  });
}
|
|
|
|
|
|
|
|
/**
 * Largest close reason, in UTF-8 bytes, that fits in a close frame: control
 * frame payloads are capped at 125 bytes and 2 of them hold the status code.
 */
const MAX_CLOSE_REASON_BYTES = 123;

/** Marker appended to an error text that had to be shortened. */
const CLOSE_REASON_ELLIPSIS = "...";

const closeReasonEncoder = new TextEncoder();

/** UTF-8 byte length of `text`. */
function closeReasonByteLength(text: string): number {
  return closeReasonEncoder.encode(text).length;
}

/**
 * Longest prefix of `text`, cut on code point boundaries, whose summed
 * per-character `cost` stays within `budget`.
 */
function clipToBudget(text: string, budget: number, cost: (ch: string) => number): string {
  let used = 0;
  let kept = "";
  for (const ch of text) {
    used += cost(ch);
    if (used > budget) break;
    kept += ch;
  }
  return kept;
}

/**
 * Serializes `msg` into a close reason of at most MAX_CLOSE_REASON_BYTES.
 * Over-long error messages get their `error` text shortened so the result is
 * still valid JSON; any other over-long message is cut at the byte limit.
 */
function serializeCloseReason(msg: Message): string {
  const full = JSON.stringify(msg);
  if (closeReasonByteLength(full) <= MAX_CLOSE_REASON_BYTES) {
    return full;
  }

  if (msg.type === "error") {
    const envelopeBytes = closeReasonByteLength(
      JSON.stringify({ ...msg, error: CLOSE_REASON_ELLIPSIS }),
    );
    // Charge each character its JSON-escaped size (minus the two quotes),
    // since escaping quotes, newlines, etc. makes the text grow.
    const clipped = clipToBudget(
      msg.error,
      MAX_CLOSE_REASON_BYTES - envelopeBytes,
      (ch) => closeReasonByteLength(JSON.stringify(ch)) - 2,
    );
    return JSON.stringify({ ...msg, error: clipped + CLOSE_REASON_ELLIPSIS });
  }

  return clipToBudget(full, MAX_CLOSE_REASON_BYTES, closeReasonByteLength);
}

/**
 * Closes the socket with `code`, passing the JSON form of `msg` as the close
 * reason. Does nothing when the socket is already closing or closed.
 *
 * The reason is shortened when necessary so it never exceeds the close frame
 * limit; an over-long reason would otherwise make the close call fail.
 *
 * @param ws - Socket to close.
 * @param code - WebSocket close status code.
 * @param msg - Message delivered as the close reason.
 */
function close(ws: WebSocket, code: number, msg: Message) {
  // CONNECTING (0) and OPEN (1) are the only states in which a close is still pending.
  if (ws.readyState <= ws.OPEN) {
    ws.close(code, serializeCloseReason(msg));
  }
}
|
|
|
|
|
|
|
|
/**
 * Streams the status of a crawl over an already-authenticated WebSocket.
 *
 * Flow:
 *  1. Looks up the crawl; closes with 1008 when it does not exist and with
 *     3003 when it belongs to a different team than `req.auth.team_id`.
 *  2. Sends one "catchup" message containing every job that is already done.
 *  3. If the crawl is no longer scraping, closes with 1000 / "done".
 *  4. Otherwise polls once per second, sending a "document" message for each
 *     newly completed job, until every job is done (close 1000), a job has
 *     no return value (close 3000 with its failure reason), the client
 *     disconnects, or polling itself fails (close 1011).
 *
 * Errors raised before polling starts propagate to the caller; errors raised
 * inside the poll loop are reported here, because the loop runs from a timer
 * and nothing upstream could catch them.
 *
 * @param ws - Client socket.
 * @param req - Upgrade request; `req.params.jobId` selects the crawl and
 *   `req.auth` must already be populated.
 */
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  const sc = await getCrawl(req.params.jobId);
  if (!sc) {
    return close(ws, 1008, { type: "error", error: "Job not found" });
  }

  if (sc.team_id !== req.auth.team_id) {
    return close(ws, 3003, { type: "error", error: "Forbidden" });
  }

  let doneJobIDs: string[] = [];
  let finished = false;

  const loop = async () => {
    if (finished) return;

    // Stop polling once the client is gone; otherwise the loop would keep
    // querying for as long as the crawl has unfinished jobs.
    if (ws.readyState !== WebSocket.OPEN) {
      finished = true;
      return;
    }

    try {
      const jobIDs = await getCrawlJobs(req.params.jobId);

      if (jobIDs.length === doneJobIDs.length) {
        finished = true;
        return close(ws, 1000, { type: "done" });
      }

      const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
      const jobStates = await Promise.all(notDoneJobIDs.map(x => getScrapeQueue().getJobState(x)));
      const newlyDoneJobIDs = notDoneJobIDs.filter((_, i) => jobStates[i] === "completed" || jobStates[i] === "failed");

      for (const jobID of newlyDoneJobIDs) {
        const job = await getJob(jobID);

        if (job?.returnvalue) {
          // Awaited so a failed write ends the stream through the catch below
          // instead of surfacing as an unhandled rejection.
          await send(ws, {
            type: "document",
            data: legacyDocumentConverter(job.returnvalue),
          });
        } else {
          finished = true;
          return close(ws, 3000, { type: "error", error: job?.failedReason ?? "Job failed" });
        }
      }

      doneJobIDs.push(...newlyDoneJobIDs);

      setTimeout(loop, 1000);
    } catch (err) {
      finished = true;
      Sentry.captureException(err);
      Logger.error("Error occurred in WebSocket status loop! (" + req.path + ") -- " + String(err));
      close(ws, 1011, { type: "error", error: "An unexpected error occurred while streaming crawl status." });
    }
  };

  doneJobIDs = await getDoneJobsOrdered(req.params.jobId);

  const jobIDs = await getCrawlJobs(req.params.jobId);
  const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled
    ? "cancelled"
    : jobStatuses.every(x => x === "completed")
      ? "completed"
      : jobStatuses.some(x => x === "failed")
        ? "failed"
        : "scraping";
  const doneJobs = await getJobs(doneJobIDs);
  const data = doneJobs.map(x => x.returnvalue);
  const expiresAt = (await getCrawlExpiry(req.params.jobId)).toISOString();

  await send(ws, {
    type: "catchup",
    data: {
      success: true,
      status,
      total: jobIDs.length,
      completed: doneJobIDs.length,
      creditsUsed: jobIDs.length,
      expiresAt,
      data: data.map(x => legacyDocumentConverter(x)),
    }
  });

  if (status !== "scraping") {
    finished = true;
    return close(ws, 1000, { type: "done" });
  }

  // Polling starts only after the catch-up is out, so the loop can neither
  // mutate doneJobIDs while the snapshot is being built nor deliver documents
  // ahead of it. Jobs that finish in the meantime are picked up by the first
  // poll, which diffs the job list against doneJobIDs.
  setTimeout(loop, 1000);
}
|
|
|
|
|
|
|
|
/**
 * WebSocket entry point for crawl status streaming. It only does middleware
 * work and error wrapping: authenticates the request, stores the resulting
 * team and plan on `req.auth`, and hands over to `crawlStatusWS`.
 *
 * Failed authentication closes the socket with 3000 and the auth error. Any
 * exception is reported to Sentry, logged under a fresh exception ID, and
 * answered with a 1011 close whose reason carries that ID.
 *
 * @param ws - Client socket.
 * @param req - Upgrade request carrying the `jobId` route parameter.
 */
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
  try {
    const { success, team_id, error, plan } = await authenticateUser(
      req,
      null,
      RateLimiterMode.CrawlStatus,
    );

    if (!success) {
      return close(ws, 3000, {
        type: "error",
        error,
      });
    }

    req.auth = { team_id, plan };

    await crawlStatusWS(ws, req);
  } catch (err) {
    Sentry.captureException(err);

    const id = uuidv4();
    let verbose: string;
    try {
      verbose = JSON.stringify(err);
      // Error instances serialize to "{}" because their fields are not enumerable.
      if (verbose === "{}" && err instanceof Error) {
        verbose = JSON.stringify({
          message: err.message,
          name: err.name,
          stack: err.stack,
        });
      }
    } catch {
      // Serialization itself can throw (e.g. circular references); never let
      // that mask the original failure.
      verbose = String(err);
    }

    Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);

    // A close reason may hold at most 123 bytes. The previous wording plus the
    // 36-character ID and the JSON envelope exceeded that, so the close call
    // failed and the client never saw the ID. This wording serializes to 117 bytes.
    return close(ws, 1011, {
      type: "error",
      error: "Unexpected error. Contact hello@firecrawl.com with ID " + id
    });
  }
}
|