feat(queue-worker): add more logs around crawl finishing logic

Author: Gergő Móricz
Date:   2025-05-09 16:52:38 +02:00
Parent: 907cf1cf41
Commit: fdeb01847d


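For context, a minimal sketch of the structured-logging pattern this commit applies: a child logger is created once at the top of finishCrawlIfNeeded with the module, method, and crawl/job ids bound as metadata, so every subsequent log line carries that context. The sketch below assumes a winston-style logger; the finishExampleCrawl function and its parameters are illustrative stand-ins, not Firecrawl's actual code.

import { createLogger, format, transports } from "winston";

// Base logger; Firecrawl's own _logger is assumed to behave similarly.
const _logger = createLogger({
  format: format.json(),
  transports: [new transports.Console()],
});

// Illustrative stand-in for finishCrawlIfNeeded's logging setup.
async function finishExampleCrawl(jobId: string, crawlId: string) {
  // Bind shared context once; every log line below carries these fields.
  const logger = _logger.child({
    module: "queue-worker",
    method: "finishCrawlIfNeeded",
    jobId,
    scrapeId: jobId,
    crawlId,
  });

  logger.info("Crawl is pre-finished, checking if we need to add more jobs");
  // ... decide whether more jobs must be enqueued ...
  logger.info("Finishing crawl");
}
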
@@ -109,7 +109,16 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
const runningJobs: Set<string> = new Set();
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const logger = _logger.child({
module: "queue-worker",
method: "finishCrawlIfNeeded",
jobId: job.id,
scrapeId: job.id,
crawlId: job.data.crawl_id,
});
if (await finishCrawlPre(job.data.crawl_id)) {
logger.info("Crawl is pre-finished, checking if we need to add more jobs");
if (
job.data.crawlerOptions &&
!(await redisConnection.exists(
@@ -131,6 +140,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
),
);
logger.info("Visited URLs", {
visitedUrls: visitedUrls.size,
});
const lastUrls: string[] = (
(
await supabase_service.rpc("diff_get_last_crawl_urls", {
@@ -142,6 +155,10 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const lastUrlsSet = new Set(lastUrls);
logger.info("Last URLs", {
lastUrls: lastUrlsSet.size,
});
const crawler = crawlToCrawler(
job.data.crawl_id,
sc,
@@ -162,15 +179,12 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
: sc.crawlerOptions.limit -
(await getDoneJobsOrderedLength(job.data.crawl_id));
console.log(
sc.originUrl!,
univistedUrls,
visitedUrls,
lastUrls,
addableJobCount,
);
if (univistedUrls.length !== 0 && addableJobCount > 0) {
logger.info("Adding jobs", {
univistedUrls: univistedUrls.length,
addableJobCount,
});
const jobs = univistedUrls.slice(0, addableJobCount).map((url) => {
const uuid = uuidv4();
return {
@@ -212,10 +226,15 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
);
await addScrapeJobs(lockedJobs);
logger.info("Added jobs, not going for the full finish", {
lockedJobs: lockedJobs.length,
});
return;
}
}
logger.info("Finishing crawl");
await finishCrawl(job.data.crawl_id);
(async () => {
@@ -267,6 +286,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
)
.filter((x) => x !== null);
logger.info("Logging crawl NOW!");
await logJob({
job_id: job.data.crawl_id,
success: jobStatus === "completed",
@@ -281,6 +301,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
crawlerOptions: sc.crawlerOptions,
origin: job.data.origin,
});
logger.info("Logged crawl!");
const data = {
success: jobStatus !== "failed",