Nick: crawl fixes

Nicolas 2024-12-03 16:25:55 -03:00
parent 1477ab2359
commit 52806807a1
2 changed files with 17 additions and 6 deletions


@@ -129,7 +129,7 @@ export async function crawlController(
         priority: 20,
       },
     };
-  });
+  })
   await lockURLs(
     id,


@@ -158,11 +158,22 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
 /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
 export async function lockURLs(id: string, sc: StoredCrawl, urls: string[]): Promise<boolean> {
-    urls = urls.map(url => {
-        return normalizeURL(url, sc);
-    });
-    const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
+    urls = urls.map(url => normalizeURL(url, sc));
+
+    // Add to visited_unique set
+    await redisConnection.sadd("crawl:" + id + ":visited_unique", ...urls);
+    await redisConnection.expire("crawl:" + id + ":visited_unique", 24 * 60 * 60, "NX");
+
+    let res: boolean;
+    if (!sc.crawlerOptions?.deduplicateSimilarURLs) {
+        const x = await redisConnection.sadd("crawl:" + id + ":visited", ...urls);
+        res = x === urls.length;
+    } else {
+        const allPermutations = urls.flatMap(url => generateURLPermutations(url).map(x => x.href));
+        const x = await redisConnection.sadd("crawl:" + id + ":visited", ...allPermutations);
+        res = x === allPermutations.length;
+    }
     await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
     return res;
 }
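
For reference on the new return value: Redis SADD returns the number of members that were not already in the set, so `x === urls.length` (or `x === allPermutations.length`) is true only when every URL in the batch was newly locked, whereas the old `!== 0` check was true if at least one URL was new. The sketch below is an in-memory illustration of that counting logic only, not code from this commit: `visited`, `makeVariants`, and `lockAll` are hypothetical stand-ins for the Redis "crawl:<id>:visited" set, `generateURLPermutations`, and `lockURLs`.

// In-memory sketch only: a Set stands in for the Redis visited set.
const visited = new Set<string>();

// Hypothetical variant generator (stand-in for generateURLPermutations):
// trailing-slash and non-trailing-slash forms of the same URL.
function makeVariants(url: string): string[] {
    const href = new URL(url).href;
    const withSlash = href.endsWith("/") ? href : href + "/";
    return [...new Set([withSlash, withSlash.slice(0, -1)])];
}

// Mirrors the counting check in the new lockURLs: the batch counts as locked
// only if every member (or every variant) was newly added to the visited set.
function lockAll(urls: string[], deduplicateSimilarURLs: boolean): boolean {
    const members = deduplicateSimilarURLs ? urls.flatMap(makeVariants) : urls;
    let added = 0;
    for (const m of members) {
        if (!visited.has(m)) {
            visited.add(m);
            added++;
        }
    }
    return added === members.length;
}

console.log(lockAll(["https://example.com/a"], true));  // true: both variants are new
console.log(lockAll(["https://example.com/a/"], true)); // false: variants already visited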