Nick: reverted redirect fix

This commit is contained in:
Nicolas 2024-12-04 12:42:56 -03:00
parent da96acdb94
commit 39ff49a8f3

View File

@@ -423,14 +423,18 @@ async function processJob(job: Job & { id: string }, token: string) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) { if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) {
logger.debug("Was redirected, removing old URL and locking new URL..."); // console.log("Original URL: ", doc.metadata.sourceURL);
// Remove the old URL from visited sets // console.log("New URL: ", doc.metadata.url);
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc)); // console.log("Normalized original URL: ", normalizeURL(doc.metadata.sourceURL, sc));
if (sc.crawlerOptions?.deduplicateSimilarURLs) { // console.log("Normalized new URL: ", normalizeURL(doc.metadata.url, sc));
const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href); // logger.debug("Was redirected, removing old URL and locking new URL...");
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations); // // Remove the old URL from visited sets
} // await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc));
await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc)); // if (sc.crawlerOptions?.deduplicateSimilarURLs) {
// const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href);
// await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations);
// }
// await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc));
// Lock the new URL // Lock the new URL
await lockURL(job.data.crawl_id, sc, doc.metadata.url); await lockURL(job.data.crawl_id, sc, doc.metadata.url);