Nick: init

2025-12-17 18:25:51 +00:00 · 2024-12-26 12:21:46 -03:00 · 2024-12-26 12:21:46 -03:00 · f467a3ae6c
commit f467a3ae6c
parent c911aad228
2 changed files with 241 additions and 120 deletions
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@ -7,6 +7,7 @@ import {
  ExtractResponse,
  MapDocument,
  scrapeOptions,
  URLTrace,
 } from "./types";
 // import { Document } from "../../lib/entities";
 import Redis from "ioredis";
@ -56,14 +57,22 @@ export async function extractController(
  let links: string[] = [];
  let docs: Document[] = [];
  const earlyReturn = false;
  const urlTraces: URLTrace[] = [];
  // Process all URLs in parallel
  const urlPromises = req.body.urls.map(async (url) => {
    const trace: URLTrace = {
      url,
      status: 'mapped',
      timing: {
        discoveredAt: new Date().toISOString(),
      },
    };
    urlTraces.push(trace);
    if (url.includes("/*") || req.body.allowExternalLinks) {
      // Handle glob pattern URLs
      const baseUrl = url.replace("/*", "");
      // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
      const allowExternalLinks = req.body.allowExternalLinks;
      let urlWithoutWww = baseUrl.replace("www.", "");
@ -75,6 +84,7 @@ export async function extractController(
          )) ?? req.body.prompt;
      }
      try {
        const mapResults = await getMapResults({
          url: baseUrl,
          search: rephrasedPrompt,
@ -83,7 +93,6 @@ export async function extractController(
          allowExternalLinks,
          origin: req.body.origin,
          limit: req.body.limit,
        // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
          ignoreSitemap: false,
          includeMetadata: true,
          includeSubdomains: req.body.includeSubdomains,
@ -95,6 +104,20 @@ export async function extractController(
        const allUrls = [...mappedLinks.map((m) => m.url), ...mapResults.links];
        const uniqueUrls = removeDuplicateUrls(allUrls);
        // Track all discovered URLs
        uniqueUrls.forEach(discoveredUrl => {
          if (!urlTraces.some(t => t.url === discoveredUrl)) {
            urlTraces.push({
              url: discoveredUrl,
              status: 'mapped',
              timing: {
                discoveredAt: new Date().toISOString(),
              },
              usedInCompletion: false, // Default to false, will update if used
            });
          }
        });
        // Only add URLs from mapResults.links that aren't already in mappedLinks
        const existingUrls = new Set(mappedLinks.map((m) => m.url));
        const newUrls = uniqueUrls.filter((url) => !existingUrls.has(url));
@ -166,22 +189,62 @@ export async function extractController(
            }
          }
          // Update URL traces with relevance scores and mark filtered out URLs
          linksAndScores.forEach((score) => {
            const trace = urlTraces.find((t) => t.url === score.link);
            if (trace) {
              trace.relevanceScore = score.score;
              // If URL didn't make it through filtering, mark it as filtered out
              if (!filteredLinks.some(link => link.url === score.link)) {
                trace.warning = `Relevance score ${score.score} below threshold`;
                trace.usedInCompletion = false;
              }
            }
          });
          mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
          // Mark URLs that will be used in completion
          mappedLinks.forEach(link => {
            const trace = urlTraces.find(t => t.url === link.url);
            if (trace) {
              trace.usedInCompletion = true;
            }
          });
          // Mark URLs that were dropped due to ranking limit
          filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => {
            const trace = urlTraces.find(t => t.url === link.url);
            if (trace) {
              trace.warning = 'Excluded due to ranking limit';
              trace.usedInCompletion = false;
            }
          });
        }
-      return mappedLinks.map((x) => x.url) as string[];
+        return mappedLinks.map((x) => x.url);
      } catch (error) {
        trace.status = 'error';
        trace.error = error.message;
        trace.usedInCompletion = false;
        return [];
      }
    } else {
      // Handle direct URLs without glob pattern
      if (!isUrlBlocked(url)) {
        trace.usedInCompletion = true;
        return [url];
      }
      trace.status = 'error';
      trace.error = 'URL is blocked';
      trace.usedInCompletion = false;
      return [];
    }
  });
  // Wait for all URL processing to complete and flatten results
  const processedUrls = await Promise.all(urlPromises);
-  const flattenedUrls = processedUrls.flat().filter((url) => url); // Filter out any null/undefined values
+  const flattenedUrls = processedUrls.flat().filter((url) => url);
  links.push(...flattenedUrls);
  if (links.length === 0) {
@ -189,13 +252,20 @@ export async function extractController(
      success: false,
      error:
        "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs.",
      urlTrace: urlTraces,
    });
  }
  // Scrape all links in parallel with retries
  const scrapePromises = links.map(async (url) => {
    const trace = urlTraces.find((t) => t.url === url);
    if (trace) {
      trace.status = 'scraped';
      trace.timing.scrapedAt = new Date().toISOString();
    }
    const origin = req.body.origin || "api";
-    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
+    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000;
    const jobId = crypto.randomUUID();
    const jobPriority = await getJobPriority({
@ -204,6 +274,7 @@ export async function extractController(
      basePriority: 10,
    });
    try {
      await addScrapeJob(
        {
          url,
@ -220,15 +291,28 @@ export async function extractController(
        jobPriority,
      );
    try {
      const doc = await waitForJob<Document>(jobId, timeout);
      await getScrapeQueue().remove(jobId);
      if (trace) {
        trace.timing.completedAt = new Date().toISOString();
        trace.contentStats = {
          rawContentLength: doc.markdown?.length || 0,
          processedContentLength: doc.markdown?.length || 0,
          tokensUsed: 0, // Will be updated after LLM processing
        };
      }
      if (earlyReturn) {
        return null;
      }
      return doc;
    } catch (e) {
      logger.error(`Error in extractController: ${e}`);
      if (trace) {
        trace.status = 'error';
        trace.error = e.message;
      }
      return null;
    }
  });
@ -240,6 +324,7 @@ export async function extractController(
    return res.status(e.status).json({
      success: false,
      error: e.error,
      urlTrace: urlTraces,
    });
  }
@ -256,9 +341,25 @@ export async function extractController(
    },
    docs.map((x) => buildDocument(x)).join("\n"),
    undefined,
-    true, // isExtractEndpoint
+    true,
  );
  // Update token usage in URL traces
  if (completions.numTokens) {
    // Distribute tokens proportionally based on content length
    const totalLength = docs.reduce((sum, doc) => sum + (doc.markdown?.length || 0), 0);
    docs.forEach((doc) => {
      if (doc.metadata?.sourceURL) {
        const trace = urlTraces.find((t) => t.url === doc.metadata.sourceURL);
        if (trace && trace.contentStats) {
          trace.contentStats.tokensUsed = Math.floor(
            ((doc.markdown?.length || 0) / totalLength) * completions.numTokens
          );
        }
      }
    });
  }
  // TODO: change this later
  // While on beta, we're billing 5 credits per link discovered/scraped.
  billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(
@ -292,6 +393,7 @@ export async function extractController(
    data: data,
    scrape_id: id,
    warning: warning,
    urlTrace: urlTraces,
  });
 }
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -379,16 +379,16 @@ export type MapRequest = z.infer<typeof mapRequestSchema>;
 export type Document = {
  markdown?: string;
  extract?: any;
  html?: string;
  rawHtml?: string;
  links?: string[];
  screenshot?: string;
  extract?: any;
  warning?: string;
  actions?: {
    screenshots?: string[];
    scrapes?: ScrapeActionContent[];
  };
  warning?: string;
  metadata: {
    title?: string;
    description?: string;
@ -425,7 +425,7 @@ export type Document = {
    error?: string;
    [key: string]: string | string[] | number | undefined;
  };
-};
+}
 export type ErrorResponse = {
  success: false;
@ -448,14 +448,33 @@ export interface ScrapeResponseRequestTest {
  error?: string;
 }
-export type ExtractResponse =
 /**
  * Per-URL diagnostic record that the extract controller in this change builds
  * up and returns as `urlTrace` in its responses.
  *
  * Field usage as seen in the controller:
  * - `url`: the requested URL, or a URL discovered while mapping it.
  * - `status`: created as 'mapped', set to 'scraped' when a scrape is started
  *   for the URL, and set to 'error' when mapping/scraping throws or the URL is
  *   blocked.
  * - `timing`: ISO-8601 strings from `new Date().toISOString()`;
  *   `discoveredAt` is set on creation, `scrapedAt` when the scrape is started,
  *   `completedAt` once the scraped document has been received.
  */
+export interface URLTrace {
-  | ErrorResponse
+  url: string;
-  | {
+  status: 'mapped' | 'scraped' | 'error';
-      success: true;
+  timing: {
-      warning?: string;
+    discoveredAt: string;
-      data: z.infer<typeof extractRequestSchema>;
+    scrapedAt?: string;
-      scrape_id?: string;
+    completedAt?: string;
  };
  /** Failure message; the controller stores the caught error's `message` or 'URL is blocked'. */
  error?: string;
  /** Reason the URL was left out of the completion (relevance score below threshold, or ranking limit). */
  warning?: string;
  /** Size statistics recorded after a successful scrape. */
  contentStats?: {
    // The controller currently sets both length fields to the markdown length.
    rawContentLength: number;
    processedContentLength: number;
    // Starts at 0; later set to a share of the completion's token count
    // proportional to this document's markdown length.
    tokensUsed: number;
  };
  /** Score assigned to this URL by the link-ranking step, when ranking ran. */
  relevanceScore?: number;
  /** Whether the URL was among those passed on for scraping/completion. */
  usedInCompletion?: boolean;
 }
 /**
  * Shape of the JSON body returned by the extract controller.
  *
  * This is a single interface with optional fields and a plain boolean
  * `success`, so the type does not by itself tie `data` or `error` to a
  * particular outcome; callers must check `success` and then test the
  * optional fields they need.
  */
 export interface ExtractResponse {
  /** `true` for a successful extraction, `false` when an error is reported. */
  success: boolean;
  /**
   * Extraction result payload.
   * NOTE(review): typed as `any`, so consumers get no compile-time checking
   * of its shape — confirm whether a stricter type can be used.
   */
  data?: any;
  /** Identifier the controller returns alongside the data. */
  scrape_id?: string;
  /** Optional non-fatal warning message. */
  warning?: string;
  /** Error message, present on failure responses. */
  error?: string;
  /** Per-URL trace entries; the controller attaches these to both success and error responses. */
  urlTrace?: URLTrace[];
 }
 export interface ExtractResponseRequestTest {
  statusCode: number;