import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
  legacyCrawlerOptions,
  mapRequestSchema,
  RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
import {
  checkAndUpdateURLForMap,
  isSameDomain,
  isSameSubdomain,
  removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";

configDotenv();

/**
 * Handles a "map" request: collects URLs related to `req.body.url` and
 * responds with them as JSON.
 *
 * Steps, in order:
 *  1. Re-parses `req.body` through `mapRequestSchema` and stores the result
 *     back on the request.
 *  2. Seeds the link list with the requested URL, then appends sitemap URLs
 *     unless `ignoreSitemap` is set.
 *  3. Runs a `site:` query through `fireEngineMap` (at most 50 results) and
 *     merges the hits: in front of the existing links when a `search` term was
 *     supplied, appended after them otherwise.
 *  4. Normalizes every link, keeps only those on the same domain (and the same
 *     subdomain unless `includeSubdomains` is set), and removes duplicates.
 *  5. Calls `billTeam` with an amount of 1, logs the job, and replies.
 *
 * @param req - Authenticated request whose body is a map request.
 * @param res - Express response used to send the result.
 * @returns The result of `res.status(200).json(...)` carrying
 *   `{ success: true, links }`, where `links` is truncated to `limit` entries.
 * @throws Nothing is caught here: any error raised by schema parsing or by an
 *   awaited call (sitemap fetch, search, billing) propagates to the caller.
 */
export async function mapController(
  req: RequestWithAuth<{}, MapResponse, MapRequest>,
  res: Response
) {
  const startTime = new Date().getTime();

  req.body = mapRequestSchema.parse(req.body);

  const limit = req.body.limit;
  const id = uuidv4();
  let links: string[] = [req.body.url];

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions: legacyCrawlerOptions(req.body),
    pageOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
  };

  const crawler = crawlToCrawler(id, sc);

  const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();

  if (sitemap !== null) {
    sitemap.forEach((entry) => {
      links.push(entry.url);
    });
  }

  // www. seems to exclude subdomains in some cases, so drop it from the search
  // query. Only a leading "www." host label (optionally after the scheme) is
  // removed; a plain `replace("www.", "")` would also rewrite a "www." that
  // happens to appear later in the path or query string.
  const urlWithoutWww = req.body.url.replace(/^(https?:\/\/)?www\./i, "$1");

  const mapUrl = req.body.search
    ? `"${req.body.search}" site:${urlWithoutWww}`
    : `site:${req.body.url}`;

  const mapResults = await fireEngineMap(mapUrl, {
    // limit to 50 results (beta)
    numResults: Math.min(limit, 50),
  });

  if (mapResults.length > 0) {
    const mapResultUrls = mapResults.map((result) => result.url);
    if (req.body.search) {
      // Ensure all map results are first, maintaining their order
      links = [...mapResultUrls, ...links];
    } else {
      links = [...links, ...mapResultUrls];
    }
  }

  links = links.map((link) => checkAndUpdateURLForMap(link).url.trim());

  // allows for subdomains to be included
  links = links.filter((link) => isSameDomain(link, req.body.url));

  // if includeSubdomains is false, filter out subdomains
  if (!req.body.includeSubdomains) {
    links = links.filter((link) => isSameSubdomain(link, req.body.url));
  }

  // remove duplicates that could be due to http/https or www
  links = removeDuplicateUrls(links);

  await billTeam(req.auth.team_id, 1);

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;

  // Not awaited: the response is sent without waiting for logging to finish.
  logJob({
    job_id: id,
    success: true,
    message: "Map completed",
    num_docs: 1,
    docs: links,
    time_taken: timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "map",
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: {},
    origin: req.body.origin,
    extractor_options: { mode: "markdown" },
    num_tokens: 0,
  });

  return res.status(200).json({
    success: true,
    // The full list is logged above; only the first `limit` links are returned.
    links: links.slice(0, limit),
  });
}