import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
  legacyCrawlerOptions,
  mapRequestSchema,
  RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
import {
  checkAndUpdateURLForMap,
  isSameDomain,
  isSameSubdomain,
  removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { performCosineSimilarity } from "../../lib/map-cosine";
import { Logger } from "../../lib/logger";
import Redis from "ioredis";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";

configDotenv();

// REDIS_URL is required for map caching; the non-null assertion keeps strict TS happy.
const redis = new Redis(process.env.REDIS_URL!);

// Max Links that /map can return
const MAX_MAP_LIMIT = 5000;

// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;

interface MapOptions {
  url: string;
  search?: string;
  limit?: number;
  ignoreSitemap?: boolean;
  includeSubdomains?: boolean;
  crawlerOptions?: any;
  teamId: string;
  plan: string;
  origin?: string;
  subId?: string;
  includeMetadata?: boolean;
}
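
/**
 * Collects candidate URLs for a site by combining its sitemap with
 * fire-engine search results, then filters to the same domain (and,
 * unless subdomains are requested, the same subdomain), dedupes, and
 * trims the result to `limit`.
 */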
export async function getMapResults({
  url,
  search,
  limit = MAX_MAP_LIMIT,
  ignoreSitemap = false,
  includeSubdomains = false,
  crawlerOptions = {},
  teamId,
  plan,
  origin,
  subId,
  includeMetadata = false,
}: MapOptions) {
  const startTime = new Date().getTime();
  const id = uuidv4();
  let links: { url: string; title?: string; description?: string }[] = [
    { url },
  ];

  // Minimal StoredCrawl so the crawler helpers (e.g. sitemap fetching)
  // can be reused without starting a real crawl.
  const sc: StoredCrawl = {
    originUrl: url,
    crawlerOptions,
    pageOptions: {},
    team_id: teamId,
    createdAt: Date.now(),
    plan,
  };

  const crawler = crawlToCrawler(id, sc);
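
  // Build the fire-engine query: quote the search term (if any) and scope
  // results to the target site with the `site:` operator.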
  let urlWithoutWww = url.replace("www.", "");
  let mapUrl = search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;

  const resultsPerPage = 100;
  const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
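
  // Cache reads are currently pinned off (cachedResult is always null), but
  // fresh results are still written to Redis below so reads can be re-enabled.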
  const cacheKey = `fireEngineMap:${mapUrl}`;
  const cachedResult = null;

  let allResults: any[] = [];
  let pagePromises: Promise<any>[] = [];

  if (cachedResult) {
    allResults = JSON.parse(cachedResult);
  } else {
    const fetchPage = async (page: number) => {
      return fireEngineMap(mapUrl, {
        numResults: resultsPerPage,
        page: page,
      });
    };

    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
    allResults = await Promise.all(pagePromises);

    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60);
  }
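
  // Note: pagePromises is awaited again below; promises settle once and are
  // memoized, so this does not re-fetch any pages. It only lets the sitemap
  // fetch run concurrently with the search requests.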
  const [sitemap, ...searchResults] = await Promise.all([
    ignoreSitemap ? null : crawler.tryGetSitemap(),
    ...(cachedResult ? [] : pagePromises),
  ]);

  if (!cachedResult) {
    allResults = searchResults;
  }

  if (sitemap !== null) {
    sitemap.forEach((x) => {
      links.push({ url: x.url });
    });
  }
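
  // Flatten the per-page search results and drop pages that failed to resolve.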
  let mapResults = allResults
    .flat()
    .filter((result) => result !== null && result !== undefined);

  const minimumCutoff = Math.min(MAX_MAP_LIMIT, limit);
  if (mapResults.length > minimumCutoff) {
    mapResults = mapResults.slice(0, minimumCutoff);
  }

  if (mapResults.length > 0) {
    if (search) {
      // Put search results ahead of the sitemap links so the most relevant
      // URLs come first.
      links = [
        {
          url: mapResults[0].url,
          title: mapResults[0].title,
          description: mapResults[0].description,
        },
        ...mapResults.slice(1).map((x) => ({
          url: x.url,
          title: x.title,
          description: x.description,
        })),
        ...links,
      ];
    } else {
      mapResults.forEach((x) => {
        links.push({
          url: x.url,
          title: x.title,
          description: x.description,
        });
      });
    }
  }
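
  // Re-rank against the search term using cosine similarity, keeping only
  // the links the helper returns.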
  if (search) {
    const filteredLinks = performCosineSimilarity(links.map((l) => l.url), search.toLowerCase());
    links = links.filter((l) => filteredLinks.includes(l.url));
  }

  // Normalize every URL and drop any that fail validation.
  links = links
    .map((x) => {
      try {
        return { ...x, url: checkAndUpdateURLForMap(x.url).url.trim() };
      } catch (_) {
        return null;
      }
    })
    .filter((x) => x !== null);

  // Keep only links on the same domain as the request URL.
  links = links.filter((x) => isSameDomain(x.url, url));

  if (!includeSubdomains) {
    links = links.filter((x) => isSameSubdomain(x.url, url));
  }
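
  // Deduplicate by URL while preserving each link's title/description.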
  links = removeDuplicateUrls(links.map((l) => l.url)).map(
    (u) => links.find((l) => l.url === u)!
  );

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;

  const linksToReturn = links.slice(0, limit);

  return {
    // Title/description metadata is included only when the caller opts in.
    links: includeMetadata ? linksToReturn : linksToReturn.map((l) => l.url),
    scrapeId: origin?.includes("website") ? id : undefined,
    timeTakenInSeconds,
    id,
    linksLength: links.length,
    linksToReturnLength: linksToReturn.length,
    docs: linksToReturn.map((l) => l.url),
  };
}
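
/**
 * POST /map handler: validates the request body, gathers map results,
 * bills the team one credit, logs the job, and returns the links.
 */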
export async function mapController(
  req: RequestWithAuth<{}, MapResponse, MapRequest>,
  res: Response<MapResponse>
) {
  req.body = mapRequestSchema.parse(req.body);

  const results = await getMapResults({
    url: req.body.url,
    search: req.body.search,
    limit: req.body.limit,
    ignoreSitemap: req.body.ignoreSitemap,
    includeSubdomains: req.body.includeSubdomains,
    crawlerOptions: legacyCrawlerOptions(req.body),
    teamId: req.auth.team_id,
    plan: req.auth.plan,
    origin: req.body.origin,
    subId: req.acuc?.sub_id,
  });

  // Billing failures are logged but never fail the request.
  await billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
  });

  await logJob({
    job_id: results.id,
    success: results.linksLength > 0,
    message: "Map completed",
    num_docs: results.linksToReturnLength,
    docs: results.docs,
    time_taken: results.timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "map",
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: {},
    origin: req.body.origin,
    extractor_options: { mode: "markdown" },
    num_tokens: 0,
  });

  return res.status(200).json({
    success: true,
    // includeMetadata is not set above, so results.links is already a plain
    // string[] of URLs.
    links: results.links as string[],
    scrape_id: results.scrapeId,
  });
}
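
/*
 * Illustrative usage sketch (the values here are assumptions, not part of
 * this module):
 *
 *   const { links, timeTakenInSeconds } = await getMapResults({
 *     url: "https://example.com",
 *     search: "docs",
 *     limit: 100,
 *     teamId: "team_123",
 *     plan: "hobby",
 *   });
 *   // `links` is a string[] (includeMetadata defaults to false), capped at
 *   // 100 entries, same-subdomain only, with search-relevant URLs first.
 */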