Nick: refactor and /* glob pattern support

Nicolas 2024-11-14 14:57:38 -05:00
parent 3d6d650f0b
commit d6749c211d
2 changed files with 105 additions and 207 deletions

File 1 of 2: extract controller

@@ -9,15 +9,9 @@ import {
   scrapeOptions,
 } from "./types";
 import { Document } from "../../lib/entities";
-import { StoredCrawl, crawlToCrawler } from "../../lib/crawl-redis";
-import { fireEngineMap } from "../../search/fireEngine";
 import Redis from "ioredis";
 import { configDotenv } from "dotenv";
 import { performRanking } from "../../lib/ranker";
-import { checkAndUpdateURLForMap } from "../../lib/validateUrl";
-import { isSameDomain } from "../../lib/validateUrl";
-import { isSameSubdomain } from "../../lib/validateUrl";
-import { removeDuplicateUrls } from "../../lib/validateUrl";
 import { billTeam } from "../../services/billing/credit_billing";
 import { logJob } from "../../services/logging/log_job";
 import { logger } from "../../lib/logger";
@@ -28,6 +22,7 @@ import { PlanType } from "../../types";
 import { getJobPriority } from "../../lib/job-priority";
 import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
+import { getMapResults } from "./map";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);
@@ -43,164 +38,68 @@ export async function extractController(
   req.body = extractRequestSchema.parse(req.body);
   const id = crypto.randomUUID();
-  let links: string[]; //= req.body.urls;
-
-  const sc: StoredCrawl = {
-    originUrl: req.body.urls[0],
-    crawlerOptions: {
-      // ...crawlerOptions,
-      scrapeOptions: undefined,
-    },
-    scrapeOptions: scrapeOptions.parse({}),
-    internalOptions: {},
-    team_id: req.auth.team_id,
-    createdAt: Date.now(),
-    plan: req.auth.plan!,
-  };
-
-  const crawler = crawlToCrawler(id, sc);
-
-  let urlWithoutWww = req.body.urls[0].replace("www.", "");
-  console.log("urlWithoutWww", urlWithoutWww);
-
-  const allowExternalLinks = req.body.allowExternalLinks ?? false;
-
-  let mapUrl = req.body.prompt && allowExternalLinks
-    ? `${req.body.prompt} ${urlWithoutWww}`
-    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
-    : `site:${urlWithoutWww}`;
-
-  const resultsPerPage = 100;
-  const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);
-
-  const cacheKey = `fireEngineMap:${mapUrl}`;
-  const cachedResult = null;
-
-  let allResults: any[] = [];
-  let pagePromises: Promise<any>[] = [];
-
-  if (cachedResult) {
-    allResults = JSON.parse(cachedResult);
-  } else {
-    const fetchPage = async (page: number) => {
-      return fireEngineMap(mapUrl, {
-        numResults: resultsPerPage,
-        page: page,
-      });
-    };
-
-    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = (await Promise.all(pagePromises)).flat();
-    // console.log("allResults", allResults);
-    // if allResults is empty, return an error
-    if (allResults.length === 0) {
-      return res.status(400).json({
-        success: false,
-        error: "No results found",
-      });
-    }
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
-  }
-
-  // console.log("allResults", allResults);
-  // Parallelize sitemap fetch with serper search
-  // const [sitemap, ...searchResults] = await Promise.all([
-  //   req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
-  //   ...(cachedResult ? [] : pagePromises),
-  // ]);
-
-  // if (!cachedResult) {
-  //   allResults = searchResults;
-  // }
-
-  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
-  console.log("links", links);
-
-  // if (sitemap !== null) {
-  //   sitemap.forEach((x) => {
-  //     links.push(x.url);
-  //   });
-  // }
-
-  // let mapResults = allResults
-  //   .flat()
-  //   .filter((result) => result !== null && result !== undefined);
-
-  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
-  // if (mapResults.length > minumumCutoff) {
-  //   mapResults = mapResults.slice(0, minumumCutoff);
-  // }
-
-  // if (mapResults.length > 0) {
-  //   if (req.body.prompt) {
-  //     // Ensure all map results are first, maintaining their order
-  //     links = [
-  //       mapResults[0].url,
-  //       ...mapResults.slice(1).map((x) => x.url),
-  //       ...links,
-  //     ];
-  //   } else {
-  //     mapResults.map((x) => {
-  //       links.push(x.url);
-  //     });
-  //   }
-  // }
-
-  // console.log("mapResults", mapResults);
-  // console.log("links", links);
-
-  let linksAndScores: { link: string; score: number }[] = [];
-  // Perform cosine similarity between the search query and the list of links
-  if (req.body.prompt) {
-    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
-    linksAndScores = await performRanking(links, searchQuery);
-  }
-  console.log("linksAndScores", linksAndScores);
-
-  links = linksAndScores
-    .filter(x => x.score > SCORE_THRESHOLD)
-    .map(x => x.link.split("url: ")[1].split(",")[0])
-    .filter(x => !isUrlBlocked(x))
-
-  console.log("links:", links.length);
-  // should we use some sort of llm to determine the best links?
-
-  // console.log("linksAndScores", linksAndScores);
-
-  // links = links
-  //   .map((x) => {
-  //     try {
-  //       return checkAndUpdateURLForMap(x).url.trim();
-  //     } catch (_) {
-  //       return null;
-  //     }
-  //   })
-  //   .filter((x) => x !== null) as string[];
-
-  // allows for subdomains to be included
-  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
-
-  // if includeSubdomains is false, filter out subdomains
-  // if (!req.body.includeSubdomains) {
-  //   links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
-  // z}
-
-  // remove duplicates that could be due to http/https or www
-  // links = removeDuplicateUrls(links);
-
-  // get top N links
-  links = links.slice(0, MAX_RANKING_LIMIT);
-
-  // scrape the links
-  let earlyReturn = false;
-  let docs: Document[] = [];
+  let links: string[] = [];
+  let docs: Document[] = [];
+  const earlyReturn = false;
+
+  for (const url of req.body.urls) {
+    if (url.includes('/*')) {
+      // Handle glob pattern URLs
+      const baseUrl = url.replace('/*', '');
+      const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
+
+      const allowExternalLinks = req.body.allowExternalLinks ?? true;
+      let urlWithoutWww = baseUrl.replace("www.", "");
+
+      let mapUrl = req.body.prompt && allowExternalLinks
+        ? `${req.body.prompt} ${urlWithoutWww}`
+        : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
+        : `site:${urlWithoutWww}`;
+
+      const mapResults = await getMapResults({
+        url: baseUrl,
+        search: req.body.prompt,
+        teamId: req.auth.team_id,
+        plan: req.auth.plan,
+        allowExternalLinks,
+        origin: req.body.origin,
+        limit: req.body.limit,
+        ignoreSitemap: false,
+        includeMetadata: true,
+        includeSubdomains: req.body.includeSubdomains,
+      });
+
+      let mappedLinks = mapResults.links.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
+
+      // Filter by path prefix if present
+      if (pathPrefix) {
+        mappedLinks = mappedLinks.filter(x => x.includes(`/${pathPrefix}/`));
+      }
+
+      if (req.body.prompt) {
+        const linksAndScores = await performRanking(mappedLinks, mapUrl);
+        mappedLinks = linksAndScores
+          .filter(x => x.score > SCORE_THRESHOLD)
+          .map(x => x.link.split("url: ")[1].split(",")[0])
+          .filter(x => !isUrlBlocked(x))
+          .slice(0, MAX_RANKING_LIMIT);
+      }
+
+      links.push(...mappedLinks);
+    } else {
+      // Handle direct URLs without glob pattern
+      if (!isUrlBlocked(url)) {
+        links.push(url);
+      }
+    }
+  }
+
+  // Scrape each link
   for (const url of links) {
     const origin = req.body.origin || "api";
     const timeout = req.body.timeout ?? 30000;
     const jobId = crypto.randomUUID();
-    const startTime = new Date().getTime();
 
     const jobPriority = await getJobPriority({
       plan: req.auth.plan as PlanType,
       team_id: req.auth.team_id,
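Note: the core of this change is the new loop over req.body.urls above. Each entry is either taken literally or, when it contains /*, expanded into many links: the glob suffix is stripped, the remaining base URL is fed to getMapResults, and any path before the glob becomes a prefix filter on the returned links. A minimal standalone sketch of that control flow, with a hypothetical fetchSiteLinks standing in for getMapResults:

    // Hypothetical helper illustrating the glob expansion above; not the committed code.
    async function expandGlobUrl(
      url: string,
      fetchSiteLinks: (baseUrl: string) => Promise<string[]>,
    ): Promise<string[]> {
      if (!url.includes("/*")) {
        return [url]; // plain URL: passed through untouched
      }
      const baseUrl = url.replace("/*", "");
      // Everything after the scheme and domain, e.g. "blog" for https://example.com/blog/*
      const pathPrefix = baseUrl.split("/").slice(3).join("/");
      let links = await fetchSiteLinks(baseUrl);
      if (pathPrefix) {
        // Keep only links under the globbed path
        links = links.filter((link) => link.includes(`/${pathPrefix}/`));
      }
      return links;
    }

For example, https://example.com/blog/* maps the site but keeps only links containing /blog/, while https://example.com/* keeps everything, since the path prefix is empty.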
@@ -223,11 +122,11 @@ export async function extractController(
       jobPriority
     );
 
-    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
+    const totalWait = 0;
 
     let doc: Document;
     try {
-      doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
+      doc = await waitForJob<Document>(jobId, timeout + totalWait);
     } catch (e) {
       logger.error(`Error in scrapeController: ${e}`);
       if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
@@ -245,36 +144,24 @@ export async function extractController(
     await getScrapeQueue().remove(jobId);
 
-    // const endTime = new Date().getTime();
-    // const timeTakenInSeconds = (endTime - startTime) / 1000;
-
-    // const numTokens =
-    //   doc && doc.extract
-    //     // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
-    //     ? 0 // TODO: fix
-    //     : 0;
-
-    let creditsToBeBilled = 1; // Assuming 1 credit per document
     if (earlyReturn) {
-      // Don't bill if we're early returning
      return;
     }
     docs.push(doc);
   }
 
-  console.log(docs)
   const completions = await generateOpenAICompletions(
     logger.child({ method: "extractController/generateOpenAICompletions" }),
     {
       mode: "llm",
       systemPrompt: "Only use the provided content to answer the question.",
-      prompt: mapUrl,
+      prompt: req.body.prompt,
       schema: req.body.schema,
     },
     docs.map(x => x.markdown).join('\n')
   );
-  console.log("completions", completions);
+  // console.log("completions", completions);
 
   // if(req.body.extract && req.body.formats.includes("extract")) {
   //   creditsToBeBilled = 5;
@@ -355,7 +242,7 @@ export async function extractController(
   return res.status(200).json({
     success: true,
-    data: data, // includeMetadata ? mapResults : linksToReturn,
-    scrape_id: id, //origin?.includes("website") ? id : undefined,
+    data: data,
+    scrape_id: id,
   });
 }
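Note: both sides of this diff share one quirk worth knowing when reading the ranking code: candidate links are serialized into `url: ..., title: ..., description: ...` strings before performRanking, and the URL is recovered afterwards by string splitting. A small sketch of that round trip (plain illustration; it assumes the URL itself contains no comma, since the parse stops at the first one):

    type MappedLink = { url: string; title?: string; description?: string };

    // Serialize the way the controller feeds links to performRanking.
    function toRankingString(x: MappedLink): string {
      return `url: ${x.url}, title: ${x.title}, description: ${x.description}`;
    }

    // Recover the URL the way the controller does after ranking:
    // the text after "url: ", up to the first comma.
    function fromRankingString(s: string): string {
      return s.split("url: ")[1].split(",")[0];
    }

    const s = toRankingString({ url: "https://example.com/pricing", title: "Pricing" });
    console.log(fromRankingString(s)); // "https://example.com/pricing"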

File 2 of 2: map controller

@@ -29,6 +29,14 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;
 
+interface MapResult {
+  success: boolean;
+  links: string[] | any[];
+  scrape_id?: string;
+  job_id: string;
+  time_taken: number;
+}
+
 export async function getMapResults({
   url,
   search,
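Note: the new MapResult interface makes getMapResults' contract explicit; job_id and time_taken exist so the caller can take over the logging that used to happen inside the helper (see the mapController hunk below). A minimal consumer sketch (hypothetical):

    // Shape as declared in the diff above, inlined so the sketch stands alone.
    type MapResult = {
      success: boolean;
      links: string[] | any[];
      scrape_id?: string;
      job_id: string;
      time_taken: number;
    };

    // Hypothetical consumer relying on the new MapResult fields.
    async function runAndLog(run: () => Promise<MapResult>): Promise<MapResult> {
      const result = await run();
      console.log(`map job ${result.job_id}: ${result.links.length} links in ${result.time_taken}s`);
      return result;
    }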
@@ -39,8 +47,8 @@ export async function getMapResults({
   teamId,
   plan,
   origin,
-  subId,
-  includeMetadata = false
+  includeMetadata = false,
+  allowExternalLinks
 }: {
   url: string;
   search?: string;
@@ -51,9 +59,9 @@ export async function getMapResults({
   teamId: string;
   plan?: string;
   origin?: string;
-  subId: string | null;
   includeMetadata?: boolean;
-}) {
+  allowExternalLinks?: boolean;
+}): Promise<MapResult> {
   const id = uuidv4();
 
   let links: string[] = [url];
@@ -74,8 +82,9 @@ export async function getMapResults({
   let urlWithoutWww = url.replace("www.", "");
 
-  let mapUrl = search
-    ? `"${search}" site:${urlWithoutWww}`
+  let mapUrl = search && allowExternalLinks
+    ? `${search} ${urlWithoutWww}`
+    : search ? `${search} site:${urlWithoutWww}`
     : `site:${url}`;
 
   const resultsPerPage = 100;
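Note: the query builder now matches the extract controller's three-way logic, and the quotes around the search term are gone. Extracted as a function for illustration (hypothetical helper, same expressions as the diff):

    function buildMapQuery(url: string, search?: string, allowExternalLinks?: boolean): string {
      const urlWithoutWww = url.replace("www.", "");
      return search && allowExternalLinks
        ? `${search} ${urlWithoutWww}`      // free-ranging search, site: restriction dropped
        : search
        ? `${search} site:${urlWithoutWww}` // search constrained to the site
        : `site:${url}`;                    // note: this branch uses the original url, www intact
    }

    buildMapQuery("https://www.example.com", "pricing", true);  // "pricing https://example.com"
    buildMapQuery("https://www.example.com", "pricing", false); // "pricing site:https://example.com"
    buildMapQuery("https://www.example.com");                   // "site:https://www.example.com"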
@@ -171,34 +180,14 @@ export async function getMapResults({
   // remove duplicates that could be due to http/https or www
   links = removeDuplicateUrls(links);
 
-  billTeam(teamId, subId, 1).catch((error) => {
-    logger.error(
-      `Failed to bill team ${teamId} for 1 credit: ${error}`
-    );
-  });
-
   const linksToReturn = links.slice(0, limit);
 
-  logJob({
-    job_id: id,
-    success: links.length > 0,
-    message: "Map completed",
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: (new Date().getTime() - Date.now()) / 1000,
-    team_id: teamId,
-    mode: "map",
-    url: url,
-    crawlerOptions: {},
-    scrapeOptions: {},
-    origin: origin ?? "api",
-    num_tokens: 0,
-  });
-
   return {
     success: true,
     links: includeMetadata ? mapResults : linksToReturn,
     scrape_id: origin?.includes("website") ? id : undefined,
+    job_id: id,
+    time_taken: (new Date().getTime() - Date.now()) / 1000,
   };
 }
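Note: the time_taken now returned from getMapResults subtracts the current time from the current time, so it evaluates to roughly zero; measuring a real duration would need a start timestamp captured before the mapping work. A sketch only, not part of this commit:

    const startTime = Date.now();
    // ... fetch, filter, and dedupe links ...
    const timeTakenSeconds = (Date.now() - startTime) / 1000;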
@@ -208,7 +197,6 @@ export async function mapController(
 ) {
   req.body = mapRequestSchema.parse(req.body);
 
-  console.log("req.body", req.body);
   const result = await getMapResults({
     url: req.body.url,
     search: req.body.search,
@@ -216,10 +204,33 @@ export async function mapController(
     ignoreSitemap: req.body.ignoreSitemap,
     includeSubdomains: req.body.includeSubdomains,
     crawlerOptions: req.body,
+    origin: req.body.origin,
     teamId: req.auth.team_id,
     plan: req.auth.plan,
-    origin: req.body.origin,
-    subId: req.acuc?.sub_id
   });
+
+  // Bill the team
+  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    logger.error(
+      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
+    );
+  });
+
+  // Log the job
+  logJob({
+    job_id: result.job_id,
+    success: result.links.length > 0,
+    message: "Map completed",
+    num_docs: result.links.length,
+    docs: result.links,
+    time_taken: result.time_taken,
+    team_id: req.auth.team_id,
+    mode: "map",
+    url: req.body.url,
+    crawlerOptions: {},
+    scrapeOptions: {},
+    origin: req.body.origin ?? "api",
+    num_tokens: 0,
+  });
 
   const response = {
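Note: the net effect for callers of the extract endpoint is that entries in urls may now end in /*, and billing and logging for /map now happen once in mapController rather than inside getMapResults. A hypothetical request body exercising the new glob support (field names taken from the req.body usages in the diff; everything else about the request is assumed):

    const body = {
      urls: ["https://example.com/blog/*"], // expanded via getMapResults, prefix-filtered to /blog/
      prompt: "Summarize each post about pricing changes",
      schema: { type: "object", properties: { summary: { type: "string" } } },
      allowExternalLinks: false,
      limit: 100,
      timeout: 30000,
    };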