Mirror of https://github.com/mendableai/firecrawl.git, synced 2025-11-20 20:29:27 +00:00
Revert "feat(queue-worker): always crawl links from content even if sitemapped"
This reverts commit 3c045c43a446bb7895892338c881cd7bc4f77cbf.
This commit is contained in:
parent
445fc432e9
commit
258c67ce67
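
In effect, the revert restores the if (!job.data.sitemapped) guard around link discovery in processJob: a page whose URL was seeded from the sitemap is no longer re-scanned for outgoing links, so only organically discovered pages fan the crawl out further. It also restores the comments around the getJobPriority call that the reverted commit had removed.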
@@ -300,41 +300,49 @@ async function processJob(job: Job, token: string) {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
 
-    if (!sc.cancelled) {
-      const crawler = crawlToCrawler(job.data.crawl_id, sc);
+    if (!job.data.sitemapped) {
+      if (!sc.cancelled) {
+        const crawler = crawlToCrawler(job.data.crawl_id, sc);
 
-      const links = crawler.filterLinks(
-        crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
-        Infinity,
-        sc.crawlerOptions?.maxDepth ?? 10
-      );
+        const links = crawler.filterLinks(
+          crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
+          Infinity,
+          sc.crawlerOptions?.maxDepth ?? 10
+        );
 
-      for (const link of links) {
-        if (await lockURL(job.data.crawl_id, sc, link)) {
-          const jobPriority = await getJobPriority({
-            plan: sc.plan as PlanType,
-            team_id: sc.team_id,
-            basePriority: job.data.crawl_id ? 20 : 10,
-          });
-          const jobId = uuidv4();
+        for (const link of links) {
+          if (await lockURL(job.data.crawl_id, sc, link)) {
+            // This seems to work really welel
+            const jobPriority = await getJobPriority({
+              plan: sc.plan as PlanType,
+              team_id: sc.team_id,
+              basePriority: job.data.crawl_id ? 20 : 10,
+            });
+            const jobId = uuidv4();
 
-          const newJob = await addScrapeJob(
-            {
-              url: link,
-              mode: "single_urls",
-              crawlerOptions: sc.crawlerOptions,
-              team_id: sc.team_id,
-              pageOptions: sc.pageOptions,
-              origin: job.data.origin,
-              crawl_id: job.data.crawl_id,
-              v1: job.data.v1,
-            },
-            {},
-            jobId,
-            jobPriority
-          );
+            // console.log("plan: ", sc.plan);
+            // console.log("team_id: ", sc.team_id)
+            // console.log("base priority: ", job.data.crawl_id ? 20 : 10)
+            // console.log("job priority: " , jobPriority, "\n\n\n")
+
+            const newJob = await addScrapeJob(
+              {
+                url: link,
+                mode: "single_urls",
+                crawlerOptions: sc.crawlerOptions,
+                team_id: sc.team_id,
+                pageOptions: sc.pageOptions,
+                origin: job.data.origin,
+                crawl_id: job.data.crawl_id,
+                v1: job.data.v1,
+              },
+              {},
+              jobId,
+              jobPriority
+            );
 
-          await addCrawlJob(job.data.crawl_id, newJob.id);
-        }
-      }
-    }
+            await addCrawlJob(job.data.crawl_id, newJob.id);
+          }
+        }
+      }
+    }
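
For illustration, a minimal, self-contained sketch of the control flow this revert restores. The names ScrapeJobData, discoverLinks, processJobSketch, and the in-memory queue are hypothetical stand-ins, not Firecrawl's actual API; only the job.data.sitemapped guard mirrors the diff, and the premise that sitemap-seeded jobs carry sitemapped: true is an assumption inferred from the flag's name and the commit title.

// Hypothetical sketch, not Firecrawl's actual API: only the `sitemapped`
// guard mirrors the diff above; everything else is a stand-in.

interface ScrapeJobData {
  url: string;
  crawl_id: string;
  sitemapped?: boolean; // assumed true for URLs seeded from the sitemap
}

// Stand-in for crawler.extractLinksFromHTML + crawler.filterLinks.
function discoverLinks(rawHtml: string): string[] {
  return [...rawHtml.matchAll(/href="([^"]+)"/g)].map((m) => m[1]);
}

const queue: ScrapeJobData[] = []; // stand-in for addScrapeJob's queue

function processJobSketch(job: { data: ScrapeJobData }, rawHtml: string): void {
  // Restored behavior: skip link discovery for sitemap-seeded pages, since
  // every sitemap URL was already enqueued when the crawl started. The
  // reverted feature removed this guard, so sitemapped pages fanned out too.
  if (!job.data.sitemapped) {
    for (const link of discoverLinks(rawHtml)) {
      queue.push({ url: link, crawl_id: job.data.crawl_id });
    }
  }
}

// A sitemapped job enqueues nothing; an organic job fans the crawl out.
const html = '<a href="https://example.com/b">b</a>';
processJobSketch({ data: { url: "https://example.com/a", crawl_id: "c1", sitemapped: true } }, html);
console.log(queue.length); // 0
processJobSketch({ data: { url: "https://example.com/a", crawl_id: "c1" } }, html);
console.log(queue.length); // 1

Under that assumption, the guard avoids double discovery: every sitemap URL is enqueued once up front, and only pages reached organically extend the crawl frontier.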