mirror of
				https://github.com/mendableai/firecrawl.git
				synced 2025-11-03 19:43:01 +00:00 
			
		
		
		
	fix(crawl-redis): normalize URL before locking
This commit is contained in:
		
							parent
							
								
									96245e387d
								
							
						
					
					
						commit
						fe721fffbe
					
				@ -1,5 +1,6 @@
 | 
				
			|||||||
import { WebCrawler } from "../scraper/WebScraper/crawler";
 | 
					import { WebCrawler } from "../scraper/WebScraper/crawler";
 | 
				
			||||||
import { redisConnection } from "../services/queue-service";
 | 
					import { redisConnection } from "../services/queue-service";
 | 
				
			||||||
 | 
					import { Logger } from "./logger";
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export type StoredCrawl = {
 | 
					export type StoredCrawl = {
 | 
				
			||||||
    originUrl: string;
 | 
					    originUrl: string;
 | 
				
			||||||
@ -88,6 +89,16 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
 | 
				
			|||||||
            return false;
 | 
					            return false;
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    try {
 | 
				
			||||||
 | 
					        const urlO = new URL(url);
 | 
				
			||||||
 | 
					        urlO.search = "";
 | 
				
			||||||
 | 
					        urlO.hash = "";
 | 
				
			||||||
 | 
					        url = urlO.href;
 | 
				
			||||||
 | 
					    } catch (error) {
 | 
				
			||||||
 | 
					        Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
 | 
					    const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0
 | 
				
			||||||
    await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
 | 
					    await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
 | 
				
			||||||
    return res;
 | 
					    return res;
 | 
				
			||||||
@ -95,6 +106,19 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
 | 
					/// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap
 | 
				
			||||||
export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
 | 
					export async function lockURLs(id: string, urls: string[]): Promise<boolean> {
 | 
				
			||||||
 | 
					    urls = urls.map(url => {
 | 
				
			||||||
 | 
					        try {
 | 
				
			||||||
 | 
					            const urlO = new URL(url);
 | 
				
			||||||
 | 
					            urlO.search = "";
 | 
				
			||||||
 | 
					            urlO.hash = "";
 | 
				
			||||||
 | 
					            return urlO.href;
 | 
				
			||||||
 | 
					        } catch (error) {
 | 
				
			||||||
 | 
					            Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return url;
 | 
				
			||||||
 | 
					    });
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
 | 
					    const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0
 | 
				
			||||||
    await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
 | 
					    await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX");
 | 
				
			||||||
    return res;
 | 
					    return res;
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user