From 51a6b83f4517cf0ddab35c8e72380867c6235469 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Dec 2024 12:56:47 -0300 Subject: [PATCH] Nick: fixed the crawl + n - not respecting limit --- apps/api/src/services/queue-worker.ts | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 140e0142..101511a9 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -423,19 +423,10 @@ async function processJob(job: Job & { id: string }, token: string) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) { - // console.log("Original URL: ", doc.metadata.sourceURL); - // console.log("New URL: ", doc.metadata.url); - // console.log("Normalized original URL: ", normalizeURL(doc.metadata.sourceURL, sc)); - // console.log("Normalized new URL: ", normalizeURL(doc.metadata.url, sc)); - // logger.debug("Was redirected, removing old URL and locking new URL..."); - // // Remove the old URL from visited sets - // await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc)); - // if (sc.crawlerOptions?.deduplicateSimilarURLs) { - // const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href); - // await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations); - // } - // await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc)); - + logger.debug("Was redirected, removing old URL and locking new URL..."); + // Remove the old URL from :visited_unique, since that set is used for limit checking + // Do not remove it from :visited, otherwise the original URL (sourceURL) would keep being re-crawled + await redisConnection.srem("crawl:" + 
job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc)); // Lock the new URL await lockURL(job.data.crawl_id, sc, doc.metadata.url); }