Nick: refactor and /* glob pattern support

2025-11-18 03:07:52 +00:00 · 2024-11-14 14:57:38 -05:00 · 2024-11-14 14:57:38 -05:00 · d6749c211d
commit d6749c211d
parent 3d6d650f0b
2 changed files with 105 additions and 207 deletions
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@ -9,15 +9,9 @@ import {
  scrapeOptions,
 } from "./types";
 import { Document } from "../../lib/entities";
-import { StoredCrawl, crawlToCrawler } from "../../lib/crawl-redis";
-import { fireEngineMap } from "../../search/fireEngine";
 import Redis from "ioredis";
 import { configDotenv } from "dotenv";
 import { performRanking } from "../../lib/ranker";
-import { checkAndUpdateURLForMap } from "../../lib/validateUrl";
-import { isSameDomain } from "../../lib/validateUrl";
-import { isSameSubdomain } from "../../lib/validateUrl";
-import { removeDuplicateUrls } from "../../lib/validateUrl";
 import { billTeam } from "../../services/billing/credit_billing";
 import { logJob } from "../../services/logging/log_job";
 import { logger } from "../../lib/logger";
@ -28,6 +22,7 @@ import { PlanType } from "../../types";
 import { getJobPriority } from "../../lib/job-priority";
 import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
+import { getMapResults } from "./map";

 configDotenv();
 const redis = new Redis(process.env.REDIS_URL!);
@ -43,164 +38,68 @@ export async function extractController(
  req.body = extractRequestSchema.parse(req.body);

  const id = crypto.randomUUID();
-  let links: string[]; //= req.body.urls;
-
-  const sc: StoredCrawl = {
-    originUrl: req.body.urls[0],
-    crawlerOptions: {
-      // ...crawlerOptions,
-      scrapeOptions: undefined,
-    },
-    scrapeOptions: scrapeOptions.parse({}),
-    internalOptions: {},
-    team_id: req.auth.team_id,
-    createdAt: Date.now(),
-    plan: req.auth.plan!,
-  };
-
-  const crawler = crawlToCrawler(id, sc);
-
-  let urlWithoutWww = req.body.urls[0].replace("www.", "");
-  console.log("urlWithoutWww", urlWithoutWww);
-
-  const allowExternalLinks = req.body.allowExternalLinks ?? false;
-
-  let mapUrl = req.body.prompt && allowExternalLinks
-    ? `${req.body.prompt} ${urlWithoutWww}`
-    : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
-    : `site:${urlWithoutWww}`;
-
-  const resultsPerPage = 100;
-  const maxPages = Math.ceil(MAX_EXTRACT_LIMIT / resultsPerPage);
-
-  const cacheKey = `fireEngineMap:${mapUrl}`;
-  const cachedResult = null;
-
-  let allResults: any[] = [];
-  let pagePromises: Promise<any>[] = [];
-
-  if (cachedResult) {
-    allResults = JSON.parse(cachedResult);
-  } else {
-    const fetchPage = async (page: number) => {
-      return fireEngineMap(mapUrl, {
-        numResults: resultsPerPage,
-        page: page,
-      });
-    };
-
-    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
-    allResults = (await Promise.all(pagePromises)).flat();
-    // console.log("allResults", allResults);
-    // if allResults is empty, return an error
-    if (allResults.length === 0) {
-      return res.status(400).json({
-        success: false,
-        error: "No results found",
-      });
-    }
-
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
-  }
-
-  // console.log("allResults", allResults);
-  // Parallelize sitemap fetch with serper search
-  // const [sitemap, ...searchResults] = await Promise.all([
-  //   req.body.ignoreSitemap ? null : null, // crawler.tryGetSitemap(),
-  //   ...(cachedResult ? [] : pagePromises),
-  // ]);
-
-  // if (!cachedResult) {
-  //   allResults = searchResults;
-  // }
-
-  links = allResults.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
-  console.log("links", links);
-  // if (sitemap !== null) {
-  //   sitemap.forEach((x) => {
-  //     links.push(x.url);
-  //   });
-  // }
-
-  // let mapResults = allResults
-  //   .flat()
-  //   .filter((result) => result !== null && result !== undefined);
-
-  // const minumumCutoff = Math.min(MAX_EXTRACT_LIMIT, req.body.limit ?? MAX_EXTRACT_LIMIT);
-  // if (mapResults.length > minumumCutoff) {
-  //   mapResults = mapResults.slice(0, minumumCutoff);
-  // }
-
-  // if (mapResults.length > 0) {
-  //   if (req.body.prompt) {
-  //     // Ensure all map results are first, maintaining their order
-  //     links = [
-  //       mapResults[0].url,
-  //       ...mapResults.slice(1).map((x) => x.url),
-  //       ...links,
-  //     ];
-  //   } else {
-  //     mapResults.map((x) => {
-  //       links.push(x.url);
-  //     });
-  //   }
-  // }
-
-  // console.log("mapResults", mapResults);
-
-  // console.log("links", links);
-  let linksAndScores: { link: string; score: number }[] = [];
-  // Perform cosine similarity between the search query and the list of links
-  if (req.body.prompt) {
-    const searchQuery = mapUrl; //req.body.prompt.toLowerCase();
-    linksAndScores = await performRanking(links, searchQuery);
-  }
-  console.log("linksAndScores", linksAndScores);
-  links = linksAndScores
-    .filter(x => x.score > SCORE_THRESHOLD)
-    .map(x => x.link.split("url: ")[1].split(",")[0])
-    .filter(x => !isUrlBlocked(x))
-
-  console.log("links:", links.length);
-
-  // should we use some sort of llm to determine the best links?
-
-  // console.log("linksAndScores", linksAndScores);
-
-  // links = links
-  //   .map((x) => {
-  //     try {
-  //       return checkAndUpdateURLForMap(x).url.trim();
-  //     } catch (_) {
-  //       return null;
-  //     }
-  //   })
-  //   .filter((x) => x !== null) as string[];
-
-  // allows for subdomains to be included
-  // links = links.filter((x) => isSameDomain(x, req.body.urls[0]));
-
-  // if includeSubdomains is false, filter out subdomains
-  // if (!req.body.includeSubdomains) {
-  //   links = links.filter((x) => isSameSubdomain(x, req.body.urls[0]));
-  // z}
-
-  // remove duplicates that could be due to http/https or www
-  // links = removeDuplicateUrls(links);
-
-  // get top N links
-  links = links.slice(0, MAX_RANKING_LIMIT);
-
-  // scrape the links
-  let earlyReturn = false;
+  let links: string[] = [];
  let docs: Document[] = [];
+  const earlyReturn = false;

+  for (const url of req.body.urls) {
+    if (url.includes('/*')) {
+      // Handle glob pattern URLs
+      const baseUrl = url.replace('/*', '');
+      const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
+
+      const allowExternalLinks = req.body.allowExternalLinks ?? true;
+      let urlWithoutWww = baseUrl.replace("www.", "");
+      let mapUrl = req.body.prompt && allowExternalLinks
+        ? `${req.body.prompt} ${urlWithoutWww}`
+        : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
+        : `site:${urlWithoutWww}`;
+
+      const mapResults = await getMapResults({
+        url: baseUrl,
+        search: req.body.prompt,
+        teamId: req.auth.team_id,
+        plan: req.auth.plan,
+        allowExternalLinks,
+        origin: req.body.origin,
+        limit: req.body.limit,
+        ignoreSitemap: false,
+        includeMetadata: true,
+        includeSubdomains: req.body.includeSubdomains,
+      });
+
+      let mappedLinks = mapResults.links.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
+      
+      // Filter by path prefix if present
+      if (pathPrefix) {
+        mappedLinks = mappedLinks.filter(x => x.includes(`/${pathPrefix}/`));
+      }
+
+      if (req.body.prompt) {
+        const linksAndScores = await performRanking(mappedLinks, mapUrl);
+        mappedLinks = linksAndScores
+          .filter(x => x.score > SCORE_THRESHOLD)
+          .map(x => x.link.split("url: ")[1].split(",")[0])
+          .filter(x => !isUrlBlocked(x))
+          .slice(0, MAX_RANKING_LIMIT);
+      }
+
+      links.push(...mappedLinks);
+
+    } else {
+      // Handle direct URLs without glob pattern
+      if (!isUrlBlocked(url)) {
+        links.push(url);
+      }
+    }
+  }
+
+  // Scrape each link
  for (const url of links) {
    const origin = req.body.origin || "api";
    const timeout = req.body.timeout ?? 30000;
    const jobId = crypto.randomUUID();

-    const startTime = new Date().getTime();
    const jobPriority = await getJobPriority({
      plan: req.auth.plan as PlanType,
      team_id: req.auth.team_id,
@ -223,11 +122,11 @@ export async function extractController(
      jobPriority
    );

-    const totalWait = 0 //60000 // (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds ?? 0 : 0) + a, 0);
+    const totalWait = 0;

    let doc: Document;
    try {
-      doc = await waitForJob<Document>(jobId, timeout + totalWait); // TODO: better types for this
+      doc = await waitForJob<Document>(jobId, timeout + totalWait);
    } catch (e) {
      logger.error(`Error in scrapeController: ${e}`);
      if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
@ -245,36 +144,24 @@ export async function extractController(

    await getScrapeQueue().remove(jobId);

-    // const endTime = new Date().getTime();
-    // const timeTakenInSeconds = (endTime - startTime) / 1000;
-    // const numTokens =
-    //   doc && doc.extract
-    //     // ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
-    //   ? 0 // TODO: fix
-    //   : 0;
-
-    let creditsToBeBilled = 1; // Assuming 1 credit per document
    if (earlyReturn) {
-      // Don't bill if we're early returning
      return;
    }
    docs.push(doc);
  }

-  console.log(docs)
-
  const completions = await generateOpenAICompletions(
    logger.child({ method: "extractController/generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt: "Only use the provided content to answer the question.",
-      prompt: mapUrl,
+      prompt: req.body.prompt,
      schema: req.body.schema,
    },
    docs.map(x => x.markdown).join('\n')
  );

-  console.log("completions", completions);
+  // console.log("completions", completions);

  // if(req.body.extract && req.body.formats.includes("extract")) {
  //   creditsToBeBilled = 5;
@ -355,7 +242,7 @@ export async function extractController(

  return res.status(200).json({
    success: true,
-    data: data, // includeMetadata ? mapResults : linksToReturn,
-    scrape_id: id, //origin?.includes("website") ? id : undefined,
+    data: data,
+    scrape_id: id,
  });
 }
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@ -29,6 +29,14 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;

+interface MapResult {
+  success: boolean;
+  links: string[] | any[];
+  scrape_id?: string;
+  job_id: string;
+  time_taken: number;
+}
+
 export async function getMapResults({
  url,
  search,
@ -39,8 +47,8 @@ export async function getMapResults({
  teamId,
  plan,
  origin,
-  subId,
-  includeMetadata = false
+  includeMetadata = false,
+  allowExternalLinks
 }: {
  url: string;
  search?: string;
@ -51,9 +59,9 @@ export async function getMapResults({
  teamId: string;
  plan?: string;
  origin?: string;
-  subId: string | null;
  includeMetadata?: boolean;
-}) {
+  allowExternalLinks?: boolean;
+}): Promise<MapResult> {
  const id = uuidv4();
  let links: string[] = [url];

@ -74,10 +82,11 @@ export async function getMapResults({

  let urlWithoutWww = url.replace("www.", "");

-  let mapUrl = search
-    ? `"${search}" site:${urlWithoutWww}`
+  let mapUrl = search && allowExternalLinks
+    ? `${search} ${urlWithoutWww}`
+    : search ? `${search} site:${urlWithoutWww}`
    : `site:${url}`;
-
+    
  const resultsPerPage = 100;
  const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);

@ -171,34 +180,14 @@ export async function getMapResults({
  // remove duplicates that could be due to http/https or www
  links = removeDuplicateUrls(links);

-  billTeam(teamId, subId, 1).catch((error) => {
-    logger.error(
-      `Failed to bill team ${teamId} for 1 credit: ${error}`
-    );
-  });
-
  const linksToReturn = links.slice(0, limit);

-  logJob({
-    job_id: id,
-    success: links.length > 0,
-    message: "Map completed", 
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: (new Date().getTime() - Date.now()) / 1000,
-    team_id: teamId,
-    mode: "map",
-    url: url,
-    crawlerOptions: {},
-    scrapeOptions: {},
-    origin: origin ?? "api",
-    num_tokens: 0,
-  });
-
  return {
    success: true,
    links: includeMetadata ? mapResults : linksToReturn,
    scrape_id: origin?.includes("website") ? id : undefined,
+    job_id: id,
+    time_taken: (new Date().getTime() - Date.now()) / 1000,
  };
 }

@ -208,7 +197,6 @@ export async function mapController(
 ) {
  req.body = mapRequestSchema.parse(req.body);

-  console.log("req.body", req.body);
  const result = await getMapResults({
    url: req.body.url,
    search: req.body.search,
@ -216,10 +204,33 @@ export async function mapController(
    ignoreSitemap: req.body.ignoreSitemap,
    includeSubdomains: req.body.includeSubdomains,
    crawlerOptions: req.body,
+    origin: req.body.origin,
    teamId: req.auth.team_id,
    plan: req.auth.plan,
-    origin: req.body.origin,
-    subId: req.acuc?.sub_id
+  });
+
+  // Bill the team
+  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    logger.error(
+      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
+    );
+  });
+
+  // Log the job
+  logJob({
+    job_id: result.job_id,
+    success: result.links.length > 0,
+    message: "Map completed",
+    num_docs: result.links.length,
+    docs: result.links,
+    time_taken: result.time_taken,
+    team_id: req.auth.team_id,
+    mode: "map", 
+    url: req.body.url,
+    crawlerOptions: {},
+    scrapeOptions: {},
+    origin: req.body.origin ?? "api",
+    num_tokens: 0,
  });

  const response = {