Mirror of https://github.com/mendableai/firecrawl.git (synced 2025-06-27 00:41:33 +00:00)

feat(crawl-status): refactor to work after a redis flush (#1664)

parent cd2e0f868c
commit ebc1de9d60
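The change, in essence: the crawl-status controller no longer 404s when the crawl's Redis state is missing; when `USE_DB_AUTHENTICATION` is enabled it now falls back to the `firecrawl_jobs` table in Supabase. A minimal sketch of that decision flow, condensed from the diff below — the dependency-injected helpers and the simplified types here are illustrative stand-ins, not the real signatures:

```ts
// A condensed sketch only, not the controller itself: the lookups are passed in
// so the example stands alone, and statuses are reduced to plain strings.
type CrawlState = { cancelled: boolean } | null;
type DbCrawlRow = { success: boolean } | null;

async function resolveCrawlStatus(deps: {
  getCrawl: () => Promise<CrawlState>; // Redis-backed crawl object ("sc" in the diff)
  allJobsCompleted: () => Promise<boolean>; // queue-state check, as in validJobStatuses below
  getDbCrawlRow: () => Promise<DbCrawlRow>; // firecrawl_jobs row keyed by job_id
  useDb: boolean; // process.env.USE_DB_AUTHENTICATION === "true"
}): Promise<"scraping" | "completed" | "cancelled" | "failed" | "not_found"> {
  const sc = await deps.getCrawl();

  if (sc) {
    // Redis path: status is still derived from the live queue, as before.
    if (sc.cancelled) return "cancelled";
    return (await deps.allJobsCompleted()) ? "completed" : "scraping";
  }

  if (deps.useDb) {
    // New fallback path: the Redis state is gone (e.g. after a flush), so the
    // answer is reconstructed from the database instead of returning 404.
    const row = await deps.getDbCrawlRow();
    if (!row) return "not_found";
    return row.success ? "completed" : "failed";
  }

  return "not_found";
}
```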
@@ -135,69 +135,173 @@ export async function crawlStatusController(
   res: Response<CrawlStatusResponse>,
   isBatch = false,
 ) {
+  const start =
+    typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
+  const end =
+    typeof req.query.limit === "string"
+      ? start + parseInt(req.query.limit, 10) - 1
+      : undefined;
+
   const sc = await getCrawl(req.params.jobId);
-  if (!sc) {
+  let status: Exclude<CrawlStatusResponse, ErrorResponse>["status"];
+  let doneJobsLength: number;
+  let doneJobsOrder: string[];
+  let totalCount: number;
+  let creditsUsed: number;
+
+  if (sc) {
+    if (sc.team_id !== req.auth.team_id) {
+      return res.status(403).json({ success: false, error: "Forbidden" });
+    }
+
+    let jobIDs = await getCrawlJobs(req.params.jobId);
+    let jobStatuses = await Promise.all(
+      jobIDs.map(
+        async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
+      ),
+    );
+
+    const teamThrottledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
+    const crawlThrottledJobsSet = sc.crawlerOptions?.delay ? await getCrawlConcurrencyLimitedJobs(req.params.jobId) : new Set();
+    const throttledJobsSet = new Set([...teamThrottledJobsSet, ...crawlThrottledJobsSet]);
+
+    const validJobStatuses: [string, JobState | "unknown"][] = [];
+    const validJobIDs: string[] = [];
+
+    for (const [id, status] of jobStatuses) {
+      if (throttledJobsSet.has(id)) {
+        validJobStatuses.push([id, "prioritized"]);
+        validJobIDs.push(id);
+      } else if (
+        status !== "failed" &&
+        status !== "unknown"
+      ) {
+        validJobStatuses.push([id, status]);
+        validJobIDs.push(id);
+      }
+    }
+
+    status =
+      sc.cancelled
+        ? "cancelled"
+        : validJobStatuses.every((x) => x[1] === "completed") &&
+            (sc.crawlerOptions
+              ? await isCrawlKickoffFinished(req.params.jobId)
+              : true)
+          ? "completed"
+          : "scraping";
+
+    // Use validJobIDs instead of jobIDs for further processing
+    jobIDs = validJobIDs;
+
+    doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
+    doneJobsOrder = await getDoneJobsOrdered(
+      req.params.jobId,
+      start,
+      end ?? -1,
+    );
+
+    totalCount = jobIDs.length;
+
+    if (totalCount === 0 && process.env.USE_DB_AUTHENTICATION === "true") {
+      const x = await supabase_rr_service
+        .from('firecrawl_jobs')
+        .select('*', { count: 'exact', head: true })
+        .eq("crawl_id", req.params.jobId)
+        .eq("success", true)
+
+      totalCount = x.count ?? 0;
+    }
+
+    creditsUsed = totalCount * (
+      sc.scrapeOptions?.extract
+        ? 5
+        : 1
+    )
+  } else if (process.env.USE_DB_AUTHENTICATION === "true") {
+    const scrapeJobCount = await supabase_rr_service
+      .from("firecrawl_jobs")
+      .select("*", { count: "exact", head: true })
+      .eq("crawl_id", req.params.jobId)
+      .eq("team_id", req.auth.team_id)
+      .eq("success", true)
+      .throwOnError();
+
+    const crawlJobQuery = await supabase_rr_service
+      .from("firecrawl_jobs")
+      .select("*")
+      .eq("job_id", req.params.jobId)
+      .limit(1)
+      .throwOnError();
+
+    if (!crawlJobQuery.data || crawlJobQuery.data.length === 0) {
+      if (scrapeJobCount.count === 0) {
+        return res.status(404).json({ success: false, error: "Job not found" });
+      } else {
+        status = "completed"; // fake completed to cut the losses
+      }
+    } else {
+      status = crawlJobQuery.data[0].success ? "completed" : "failed";
+    }
+
+    const crawlJob = crawlJobQuery.data?.[0];
+
+    if (crawlJob && crawlJob.team_id !== req.auth.team_id) {
+      return res.status(403).json({ success: false, error: "Forbidden" });
+    }
+
+    if (
+      crawlJob
+      && new Date().valueOf() - new Date(crawlJob.date_added).valueOf() > 24 * 60 * 60 * 1000
+    ) {
+      return res.status(404).json({ success: false, error: "Job expired" });
+    }
+
+    doneJobsLength = scrapeJobCount.count!;
+    doneJobsOrder = [];
+
+    const step = 1000;
+    let i = 0;
+    while (true) {
+      const rangeStart = start + (i * step);
+      let rangeEnd = start + ((i + 1) * step);
+      if (end !== undefined) {
+        rangeEnd = Math.min(end, rangeEnd);
+      }
+
+      const currentJobs = await supabase_rr_service
+        .from("firecrawl_jobs")
+        .select("job_id")
+        .eq("crawl_id", req.params.jobId)
+        .eq("team_id", req.acuc.team_id)
+        .order("date_added", { ascending: true })
+        .range(rangeStart, rangeEnd)
+        .throwOnError();
+
+      const rangeLength = rangeEnd - rangeStart;
+
+      const data = currentJobs.data ?? [];
+
+      doneJobsOrder.push(...data.map(x => x.job_id));
+
+      if (data.length < rangeLength) {
+        break;
+      }
+
+      if (rangeEnd === end) {
+        break;
+      }
+
+      i++
+    }
+
+    totalCount = scrapeJobCount.count ?? 0;
+    creditsUsed = crawlJob?.credits_billed ?? totalCount;
+  } else {
     return res.status(404).json({ success: false, error: "Job not found" });
   }

-  if (sc.team_id !== req.auth.team_id) {
-    return res.status(403).json({ success: false, error: "Forbidden" });
-  }
-
-  const start =
-    typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
-  const end =
-    typeof req.query.limit === "string"
-      ? start + parseInt(req.query.limit, 10) - 1
-      : undefined;
-
-  let jobIDs = await getCrawlJobs(req.params.jobId);
-  let jobStatuses = await Promise.all(
-    jobIDs.map(
-      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
-    ),
-  );
-
-  const teamThrottledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
-  const crawlThrottledJobsSet = sc.crawlerOptions?.delay ? await getCrawlConcurrencyLimitedJobs(req.params.jobId) : new Set();
-  const throttledJobsSet = new Set([...teamThrottledJobsSet, ...crawlThrottledJobsSet]);
-
-  const validJobStatuses: [string, JobState | "unknown"][] = [];
-  const validJobIDs: string[] = [];
-
-  for (const [id, status] of jobStatuses) {
-    if (throttledJobsSet.has(id)) {
-      validJobStatuses.push([id, "prioritized"]);
-      validJobIDs.push(id);
-    } else if (
-      status !== "failed" &&
-      status !== "unknown"
-    ) {
-      validJobStatuses.push([id, status]);
-      validJobIDs.push(id);
-    }
-  }
-
-  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
-    sc.cancelled
-      ? "cancelled"
-      : validJobStatuses.every((x) => x[1] === "completed") &&
-          (sc.crawlerOptions
-            ? await isCrawlKickoffFinished(req.params.jobId)
-            : true)
-        ? "completed"
-        : "scraping";
-
-  // Use validJobIDs instead of jobIDs for further processing
-  jobIDs = validJobIDs;
-
-  const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
-  const doneJobsOrder = await getDoneJobsOrdered(
-    req.params.jobId,
-    start,
-    end ?? -1,
-  );
-
   let doneJobs: PseudoJob<any>[] = [];

   if (end === undefined) {
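The DB-fallback branch in the hunk above reads job IDs back from `firecrawl_jobs` page by page. A self-contained sketch of that windowing arithmetic, with the Supabase query replaced by a stand-in fetcher (`FetchRange`, `toWindow`, and `collectJobIds` are illustrative names; the 1000-row step and the skip/limit handling mirror the diff):

```ts
// Stand-in for the Supabase range query used in the fallback branch above:
// returns the job IDs for rows rangeStart..rangeEnd (inclusive).
type FetchRange = (rangeStart: number, rangeEnd: number) => Promise<string[]>;

// Mirrors the hunk above: skip/limit become an inclusive [start, end] window,
// with end left undefined when no limit was given.
function toWindow(skip?: string, limit?: string): { start: number; end?: number } {
  const start = typeof skip === "string" ? parseInt(skip, 10) : 0;
  const end =
    typeof limit === "string" ? start + parseInt(limit, 10) - 1 : undefined;
  return { start, end };
}

// Mirrors the fallback loop: pull job IDs in steps of 1000 until a page comes
// back short or the requested window is exhausted.
async function collectJobIds(
  fetchRange: FetchRange,
  start: number,
  end?: number,
): Promise<string[]> {
  const step = 1000;
  const out: string[] = [];
  for (let i = 0; ; i++) {
    const rangeStart = start + i * step;
    let rangeEnd = start + (i + 1) * step;
    if (end !== undefined) {
      rangeEnd = Math.min(end, rangeEnd);
    }

    const data = await fetchRange(rangeStart, rangeEnd);
    out.push(...data);

    if (data.length < rangeEnd - rangeStart) break; // short page: no more rows
    if (rangeEnd === end) break; // reached the end of the requested window
  }
  return out;
}
```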
@@ -265,28 +369,12 @@ export async function crawlStatusController(
       nextURL.searchParams.set("limit", req.query.limit);
     }

-  let totalCount = jobIDs.length;
-
-  if (totalCount === 0 && process.env.USE_DB_AUTHENTICATION === "true") {
-    const x = await supabase_rr_service
-      .from('firecrawl_jobs')
-      .select('*', { count: 'exact', head: true })
-      .eq("crawl_id", req.params.jobId)
-      .eq("success", true)
-
-    totalCount = x.count ?? 0;
-  }
-
   res.status(200).json({
     success: true,
     status,
     completed: doneJobsLength,
     total: totalCount,
-    creditsUsed: totalCount * (
-      sc.scrapeOptions?.extract
-        ? 5
-        : 1
-    ),
+    creditsUsed,
     expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
     next:
       status !== "scraping" && start + data.length === doneJobsLength // if there's not gonna be any documents after this
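On the client side, the `skip` and `limit` query parameters parsed at the top of the controller drive pagination of the status response. A hypothetical usage sketch: the base URL and route are assumptions (the diff only shows the controller), while the query parameters and response fields mirror what the controller reads and returns:

```ts
// Hypothetical usage; the host, route, and API-key handling are assumptions.
// Only the skip/limit parameters and the response fields come from the diff.
async function fetchCrawlStatusPage(jobId: string, apiKey: string, skip = 0, limit = 100) {
  const url = new URL(`https://api.firecrawl.dev/v1/crawl/${jobId}`);
  url.searchParams.set("skip", String(skip));
  url.searchParams.set("limit", String(limit));

  const res = await fetch(url, { headers: { Authorization: `Bearer ${apiKey}` } });
  if (!res.ok) throw new Error(`crawl status request failed: ${res.status}`);

  // Fields as assembled in res.status(200).json({ ... }) above.
  return (await res.json()) as {
    success: boolean;
    status: "scraping" | "completed" | "cancelled" | "failed";
    completed: number;
    total: number;
    creditsUsed: number;
    expiresAt: string;
    next?: string;
    data?: unknown[];
  };
}
```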