diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts
index 178213d0..e6d9c7c5 100644
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@@ -9,15 +9,9 @@ import {
   scrapeOptions,
 } from "./types";
 import { Document } from "../../lib/entities";
-import { StoredCrawl, crawlToCrawler } from "../../lib/crawl-redis";
-import { fireEngineMap } from "../../search/fireEngine";
 import Redis from "ioredis";
 import { configDotenv } from "dotenv";
 import { performRanking } from "../../lib/ranker";
-import { checkAndUpdateURLForMap } from "../../lib/validateUrl";
-import { isSameDomain } from "../../lib/validateUrl";
-import { isSameSubdomain } from "../../lib/validateUrl";
-import { removeDuplicateUrls } from "../../lib/validateUrl";
 import { billTeam } from "../../services/billing/credit_billing";
 import { logJob } from "../../services/logging/log_job";
 import { logger } from "../../lib/logger";
@@ -28,6 +22,7 @@ import { PlanType } from "../../types";
 import { getJobPriority } from "../../lib/job-priority";
 import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
+import { getMapResults } from "./map";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);
@@ -43,164 +38,68 @@ export async function extractController(
   req.body = extractRequestSchema.parse(req.body);
 
   const id = crypto.randomUUID();
-  let links: string[]; //= req.body.urls;
-
-  const sc: StoredCrawl = {
-    originUrl: req.body.urls[0],
-    crawlerOptions: {
-      // ...crawlerOptions,
-      scrapeOptions: undefined,
-    },
-    scrapeOptions: scrapeOptions.parse({}),
-    internalOptions: {},
-    team_id: req.auth.team_id,
-    createdAt: Date.now(),
-    plan: req.auth.plan!,
-  };
-
-  const crawler = crawlToCrawler(id, sc);
-
-  let urlWithoutWww = req.body.urls[0].replace("www.", "");
-  console.log("urlWithoutWww", urlWithoutWww);
-
-  const allowExternalLinks = req.body.allowExternalLinks ?? false;
-
-  let mapUrl = req.body.prompt && allowExternalLinks
-    ? `${req.body.prompt} ${urlWithoutWww}`
-    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
-    : `site:${urlWithoutWww}`;
-
-  const resultsPerPage = 100;
-  const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);
-
-  const cacheKey = `fireEngineMap:${mapUrl}`;
-  const cachedResult = null;
-
-  let allResults: any[] = [];
-  let pagePromises: Promise<any>[] = [];
-
-  if (cachedResult) {
-    allResults = JSON.parse(cachedResult);
-  } else {
-    const fetchPage = async (page: number) => {
-      return fireEngineMap(mapUrl, {
-        numResults: resultsPerPage,
-        page: page,
-      });
-    };
-
-    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = (await Promise.all(pagePromises)).flat();
-    // console.log("allResults", allResults);
-    // if allResults is empty, return an error
-    if (allResults.length === 0) {
-      return res.status(400).json({
-        success: false,
-        error: "No results found",
-      });
-    }
-
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
-  }
-
-  // console.log("allResults", allResults);
-  // Parallelize sitemap fetch with serper search
-  // const [sitemap, ...searchResults] = await Promise.all([
-  //   req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
-  //   ...(cachedResult ? [] : pagePromises),
-  // ]);
-
-  // if (!cachedResult) {
-  //   allResults = searchResults;
-  // }
-
-  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
-  console.log("links", links);
-  // if (sitemap !== null) {
-  //   sitemap.forEach((x) => {
-  //     links.push(x.url);
-  //   });
-  // }
-
-  // let mapResults = allResults
-  //   .flat()
-  //   .filter((result) => result !== null && result !== undefined);
-
-  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
-  // if (mapResults.length > minumumCutoff) {
-  //   mapResults = mapResults.slice(0, minumumCutoff);
-  // }
-
-  // if (mapResults.length > 0) {
-  //   if (req.body.prompt) {
-  //     // Ensure all map results are first, maintaining their order
-  //     links = [
-  //       mapResults[0].url,
-  //       ...mapResults.slice(1).map((x) => x.url),
-  //       ...links,
-  //     ];
-  //   } else {
-  //     mapResults.map((x) => {
-  //       links.push(x.url);
-  //     });
-  //   }
-  // }
-
-  // console.log("mapResults", mapResults);
-
-  // console.log("links", links);
-  let linksAndScores: { link: string; score: number }[] = [];
-  // Perform cosine similarity between the search query and the list of links
-  if (req.body.prompt) {
-    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
-    linksAndScores = await performRanking(links, searchQuery);
-  }
-  console.log("linksAndScores", linksAndScores);
-  links = linksAndScores
-    .filter(x => x.score > SCORE_THRESHOLD)
-    .map(x => x.link.split("url: ")[1].split(",")[0])
-    .filter(x => !isUrlBlocked(x))
-
-  console.log("links:", links.length);
-
-  // should we use some sort of llm to determine the best links?
-
-  // console.log("linksAndScores", linksAndScores);
-
-  // links = links
-  //   .map((x) => {
-  //     try {
-  //       return checkAndUpdateURLForMap(x).url.trim();
-  //     } catch (_) {
-  //       return null;
-  //     }
-  //   })
-  //   .filter((x) => x !== null) as string[];
-
-  // allows for subdomains to be included
-  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
-
-  // if includeSubdomains is false, filter out subdomains
-  // if (!req.body.includeSubdomains) {
-  //   links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
-  // z}
-
-  // remove duplicates that could be due to http/https or www
-  // links = removeDuplicateUrls(links);
-
-  // get top N links
-  links = links.slice(0, MAX_RANKING_LIMIT);
-
-  // scrape the links
-  let earlyReturn = false;
+  let links: string[] = [];
   let docs: Document[] = [];
+  const earlyReturn = false;
 
+  for (const url of req.body.urls) {
+    if (url.includes('/*')) {
+      // Handle glob pattern URLs
+      const baseUrl = url.replace('/*', '');
+      const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
+
+      const allowExternalLinks = req.body.allowExternalLinks ?? true;
+      let urlWithoutWww = baseUrl.replace("www.", "");
+      let mapUrl = req.body.prompt && allowExternalLinks
+        ? `${req.body.prompt} ${urlWithoutWww}`
+        : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
+        : `site:${urlWithoutWww}`;
+
+      const mapResults = await getMapResults({
+        url: baseUrl,
+        search: req.body.prompt,
+        teamId: req.auth.team_id,
+        plan: req.auth.plan,
+        allowExternalLinks,
+        origin: req.body.origin,
+        limit: req.body.limit,
+        ignoreSitemap: false,
+        includeMetadata: true,
+        includeSubdomains: req.body.includeSubdomains,
+      });
+
+      let mappedLinks = mapResults.links.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
+
+      // Filter by path prefix if present
+      if (pathPrefix) {
+        mappedLinks = mappedLinks.filter(x => x.includes(`/${pathPrefix}/`));
+      }
+
+      if (req.body.prompt) {
+        const linksAndScores = await performRanking(mappedLinks, mapUrl);
+        mappedLinks = linksAndScores
+          .filter(x => x.score > SCORE_THRESHOLD)
+          .map(x => x.link.split("url: ")[1].split(",")[0])
+          .filter(x => !isUrlBlocked(x))
+          .slice(0, MAX_RANKING_LIMIT);
+      }
+
+      links.push(...mappedLinks);
+
+    } else {
+      // Handle direct URLs without glob pattern
+      if (!isUrlBlocked(url)) {
+        links.push(url);
+      }
+    }
+  }
+
+  // Scrape each link
   for (const url of links) {
     const origin = req.body.origin || "api";
     const timeout = req.body.timeout ?? 30000;
    const jobId = crypto.randomUUID();
 
-    const startTime = new Date().getTime();
     const jobPriority = await getJobPriority({
       plan: req.auth.plan as PlanType,
       team_id: req.auth.team_id,
@@ -223,11 +122,11 @@ export async function extractController(
       jobPriority
     );
 
-    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
+    const totalWait = 0;
 
     let doc: Document;
     try {
-      doc = await waitForJob(jobId, timeout + totalWait); // TODO: better types for this
+      doc = await waitForJob(jobId, timeout + totalWait);
     } catch (e) {
       logger.error(`Error in scrapeController: ${e}`);
       if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
@@ -245,36 +144,24 @@ export async function extractController(
     await getScrapeQueue().remove(jobId);
 
-    // const endTime = new Date().getTime();
-    // const timeTakenInSeconds = (endTime - startTime) / 1000;
-    // const numTokens =
-    //   doc && doc.extract
-    //     // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
-    //     ? 0 // TODO: fix
-    //     : 0;
-
-    let creditsToBeBilled = 1; // Assuming 1 credit per document
     if (earlyReturn) {
-      // Don't bill if we're early returning
       return;
     }
 
     docs.push(doc);
   }
 
-  console.log(docs)
-
   const completions = await generateOpenAICompletions(
     logger.child({ method: "extractController/generateOpenAICompletions" }),
     {
       mode: "llm",
       systemPrompt: "Only use the provided content to answer the question.",
-      prompt: mapUrl,
+      prompt: req.body.prompt,
       schema: req.body.schema,
     },
     docs.map(x => x.markdown).join('\n')
   );
 
-  console.log("completions", completions);
+  // console.log("completions", completions);
 
   // if(req.body.extract && req.body.formats.includes("extract")) {
   //   creditsToBeBilled = 5;
@@ -355,7 +242,7 @@ export async function extractController(
 
   return res.status(200).json({
     success: true,
-    data: data, // includeMetadata ? mapResults : linksToReturn,
-    scrape_id: id, //origin?.includes("website") ? id : undefined,
+    data: data,
+    scrape_id: id,
   });
 }
\ No newline at end of file
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 91d712de..f2e9453a 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -29,6 +29,14 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;
 
+interface MapResult {
+  success: boolean;
+  links: string[] | any[];
+  scrape_id?: string;
+  job_id: string;
+  time_taken: number;
+}
+
 export async function getMapResults({
   url,
   search,
@@ -39,8 +47,8 @@
   teamId,
   plan,
   origin,
-  subId,
-  includeMetadata = false
+  includeMetadata = false,
+  allowExternalLinks
 }: {
   url: string;
   search?: string;
@@ -51,9 +59,9 @@
   teamId: string;
   plan?: string;
   origin?: string;
-  subId: string | null;
   includeMetadata?: boolean;
-}) {
+  allowExternalLinks?: boolean;
+}): Promise<MapResult> {
   const id = uuidv4();
 
   let links: string[] = [url];
@@ -74,10 +82,11 @@
 
   let urlWithoutWww = url.replace("www.", "");
 
-  let mapUrl = search
-    ? `"${search}" site:${urlWithoutWww}`
+  let mapUrl = search && allowExternalLinks
+    ? `${search} ${urlWithoutWww}`
+    : search ? `${search} site:${urlWithoutWww}`
     : `site:${url}`;
-
+
   const resultsPerPage = 100;
   const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
 
@@ -171,34 +180,14 @@
   // remove duplicates that could be due to http/https or www
   links = removeDuplicateUrls(links);
 
-  billTeam(teamId, subId, 1).catch((error) => {
-    logger.error(
-      `Failed to bill team ${teamId} for 1 credit: ${error}`
-    );
-  });
-
   const linksToReturn = links.slice(0, limit);
 
-  logJob({
-    job_id: id,
-    success: links.length > 0,
-    message: "Map completed",
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: (new Date().getTime() - Date.now()) / 1000,
-    team_id: teamId,
-    mode: "map",
-    url: url,
-    crawlerOptions: {},
-    scrapeOptions: {},
-    origin: origin ?? "api",
-    num_tokens: 0,
-  });
-
   return {
     success: true,
     links: includeMetadata ? mapResults : linksToReturn,
     scrape_id: origin?.includes("website") ? id : undefined,
+    job_id: id,
+    time_taken: (new Date().getTime() - Date.now()) / 1000,
   };
 }
 
@@ -208,7 +197,6 @@ export async function mapController(
 ) {
   req.body = mapRequestSchema.parse(req.body);
 
-  console.log("req.body", req.body);
   const result = await getMapResults({
     url: req.body.url,
     search: req.body.search,
@@ -216,10 +204,33 @@ export async function mapController(
     ignoreSitemap: req.body.ignoreSitemap,
     includeSubdomains: req.body.includeSubdomains,
     crawlerOptions: req.body,
+    origin: req.body.origin,
     teamId: req.auth.team_id,
     plan: req.auth.plan,
-    origin: req.body.origin,
-    subId: req.acuc?.sub_id
+  });
+
+  // Bill the team
+  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    logger.error(
+      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
+    );
+  });
+
+  // Log the job
+  logJob({
+    job_id: result.job_id,
+    success: result.links.length > 0,
+    message: "Map completed",
+    num_docs: result.links.length,
+    docs: result.links,
+    time_taken: result.time_taken,
+    team_id: req.auth.team_id,
+    mode: "map",
+    url: req.body.url,
+    crawlerOptions: {},
+    scrapeOptions: {},
+    origin: req.body.origin ?? "api",
+    num_tokens: 0,
   });
 
   const response = {
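
Taken together, the two files now split responsibilities: getMapResults only maps a site and returns a typed MapResult, while billing (billTeam) and job logging (logJob) happen in the controllers that call it. Below is a minimal caller sketch against the new contract; the option names and MapResult fields come from this diff, but the concrete URL, team id, and plan values are invented for illustration.

import { getMapResults } from "./map";

// Hypothetical helper: map a site and return plain string links, leaving
// billing and logging to the caller, as mapController now does above.
async function mapExampleSite(): Promise<string[]> {
  const result = await getMapResults({
    url: "https://example.com",   // invented example input
    search: "api reference",      // optional; biases ranking like a prompt
    limit: 100,
    ignoreSitemap: false,
    includeSubdomains: true,
    allowExternalLinks: false,    // keeps the site: operator in the search query
    includeMetadata: false,       // plain string links instead of result objects
    teamId: "team_123",           // invented
    plan: "standard",             // invented
    origin: "api",
  });

  // result.job_id and result.time_taken are returned so the caller can feed
  // logJob, mirroring mapController; getMapResults no longer bills or logs.
  return result.links as string[];
}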