From 39ff49a8f3483a925d724e86cb46c9ff6f46c4ef Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 4 Dec 2024 12:42:56 -0300 Subject: [PATCH] Nick: reverted redirect fix --- apps/api/src/services/queue-worker.ts | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 61748312..140e0142 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -423,14 +423,18 @@ async function processJob(job: Job & { id: string }, token: string) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) { - logger.debug("Was redirected, removing old URL and locking new URL..."); - // Remove the old URL from visited sets - await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc)); - if (sc.crawlerOptions?.deduplicateSimilarURLs) { - const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href); - await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations); - } - await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc)); + // console.log("Original URL: ", doc.metadata.sourceURL); + // console.log("New URL: ", doc.metadata.url); + // console.log("Normalized original URL: ", normalizeURL(doc.metadata.sourceURL, sc)); + // console.log("Normalized new URL: ", normalizeURL(doc.metadata.url, sc)); + // logger.debug("Was redirected, removing old URL and locking new URL..."); + // // Remove the old URL from visited sets + // await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc)); + // if (sc.crawlerOptions?.deduplicateSimilarURLs) { + // const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href); + // await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations); + // } + // await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc)); // Lock the new URL await lockURL(job.data.crawl_id, sc, doc.metadata.url);