mirror of https://github.com/mendableai/firecrawl.git
synced 2025-07-30 04:20:47 +00:00
Nick: wip
This commit is contained in:
parent
8a4f4cb9d9
commit
78badf8f72
apps/api/package.json
@@ -53,7 +53,7 @@
     "@bull-board/api": "^5.20.5",
     "@bull-board/express": "^5.20.5",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.13",
+    "@dqbd/tiktoken": "^1.0.17",
     "@hyperdx/node-opentelemetry": "^0.8.1",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.40.8",
@@ -73,6 +73,7 @@
     "cacheable-lookup": "^6.1.0",
     "cheerio": "^1.0.0-rc.12",
     "cohere": "^1.1.1",
+    "cohere-ai": "^7.14.0",
     "cors": "^2.8.5",
     "cron-parser": "^4.9.0",
     "date-fns": "^3.6.0",
apps/api/pnpm-lock.yaml (generated, 1324 lines)
File diff suppressed because it is too large
apps/api/src/controllers/v1/extract.ts (new file, 197 lines)
@@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import {
  Document,
  legacyDocumentConverter,
  legacyExtractorOptions,
  legacyScrapeOptions,
  RequestWithAuth,
  ExtractRequest,
  extractRequestSchema,
  ExtractResponse,
  legacyCrawlerOptions,
  MapDocument,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
import { getMapResults } from "./map";
import { rerankDocuments } from "../../lib/extract/reranker";
import { generateBasicCompletion } from "../../lib/extract/completions";


export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>
) {
  req.body = extractRequestSchema.parse(req.body);
  let earlyReturn = false;

  const origin = req.body.origin;
  const timeout = req.body.timeout;
  // const pageOptions = legacyScrapeOptions(req.body);
  // const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
  const jobPriority = await getJobPriority({
    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
    basePriority: 10,
  });

  const urls = req.body.urls;
  const mappedDocuments: MapDocument[] = [];

  const prompt = req.body.prompt;
  const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);

  for (const url of urls) {
    if (url.endsWith("/*")) {
      const mapResults = await getMapResults({
        url: url.slice(0, -2),
        search: req.body.prompt,
        limit: 100,
        ignoreSitemap: true,
        includeSubdomains: false,
        crawlerOptions: {},
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        subId: req.acuc?.sub_id,
        includeMetadata: true
      });
      // top 3 links
      const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
      console.log(top3Links);
      // console.log(top3Links);
      mappedDocuments.push(...(mapResults.links as MapDocument[]));
      // transform mappedUrls to just documents
      // we quickly rerank
      const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + req.body.prompt.toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
      console.log(rerank);
    } else {
      mappedDocuments.push({ url });
    }
  }

  req.body.urls = mappedDocuments.map(x => x.url);

  // const job = await addScrapeJob(
  //   {
  //     url: req.body.url,
  //     mode: "single_urls",
  //     crawlerOptions: {},
  //     team_id: req.auth.team_id,
  //     plan: req.auth.plan,
  //     pageOptions,
  //     extractorOptions,
  //     origin: req.body.origin,
  //     is_scrape: true,
  //   },
  //   {},
  //   jobId,
  //   jobPriority
  // );

  // const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);

  // let doc: any | undefined;
  // try {
  //   doc = (await waitForJob(job.id, timeout + totalWait))[0];
  // } catch (e) {
  //   Logger.error(`Error in scrapeController: ${e}`);
  //   if (e instanceof Error && e.message.startsWith("Job wait")) {
  //     return res.status(408).json({
  //       success: false,
  //       error: "Request timed out",
  //     });
  //   } else {
  //     return res.status(500).json({
  //       success: false,
  //       error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
  //         extractorOptions && extractorOptions.mode !== "markdown"
  //           ? " - Could be due to LLM parsing issues"
  //           : ""
  //       }`,
  //     });
  //   }
  // }

  // await job.remove();

  // if (!doc) {
  //   console.error("!!! PANIC DOC IS", doc, job);
  //   return res.status(200).json({
  //     success: true,
  //     warning: "No page found",
  //     data: doc,
  //   });
  // }

  // delete doc.index;
  // delete doc.provider;

  // const endTime = new Date().getTime();
  // const timeTakenInSeconds = (endTime - startTime) / 1000;
  // const numTokens =
  //   doc && doc.markdown
  //     ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
  //     : 0;

  // let creditsToBeBilled = 1; // Assuming 1 credit per document
  // if (earlyReturn) {
  //   // Don't bill if we're early returning
  //   return;
  // }
  // if(req.body.extract && req.body.formats.includes("extract")) {
  //   creditsToBeBilled = 5;
  // }

  // billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
  //   Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
  //   // Optionally, you could notify an admin or add to a retry queue here
  // });

  // if (!pageOptions || !pageOptions.includeRawHtml) {
  //   if (doc && doc.rawHtml) {
  //     delete doc.rawHtml;
  //   }
  // }

  // if(pageOptions && pageOptions.includeExtract) {
  //   if(!pageOptions.includeMarkdown && doc && doc.markdown) {
  //     delete doc.markdown;
  //   }
  // }

  // logJob({
  //   job_id: jobId,
  //   success: true,
  //   message: "Scrape completed",
  //   num_docs: 1,
  //   docs: [doc],
  //   time_taken: timeTakenInSeconds,
  //   team_id: req.auth.team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: {},
  //   pageOptions: pageOptions,
  //   origin: origin,
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });

  return res.status(200).json({
    success: true,
    data: null,
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
}
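The controller above is still WIP: wildcard URLs are mapped and reranked, while the actual scrape/extract flow remains commented out and the response is stubbed with data: null. For orientation, a client call exercising it might look like the sketch below; the /v1/extract path, base URL, and Authorization header are assumptions (route registration is not part of this diff), and only the body shape follows the extractV1Options schema added later in this commit.

// Illustrative client call in TypeScript, not part of this commit.
const res = await fetch(`${process.env.FIRECRAWL_API_URL}/v1/extract`, {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    urls: ["https://docs.example.com/*"], // a trailing "/*" triggers the map + rerank path
    prompt: "Extract the pricing tiers and their monthly costs",
    timeout: 60000,
  }),
});
const json = await res.json(); // currently { success: true, data: null, scrape_id? } while the flow is WIP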
apps/api/src/controllers/v1/map.ts
@@ -15,11 +15,11 @@ import {
   removeDuplicateUrls,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
-import { billTeam } from "../../services/billing/credit_billing";
-import { logJob } from "../../services/logging/log_job";
 import { performCosineSimilarity } from "../../lib/map-cosine";
 import { Logger } from "../../lib/logger";
 import Redis from "ioredis";
+import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL);
@@ -29,35 +29,50 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;
 
-export async function mapController(
-  req: RequestWithAuth<{}, MapResponse, MapRequest>,
-  res: Response<MapResponse>
-) {
+interface MapOptions {
+  url: string;
+  search?: string;
+  limit?: number;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  crawlerOptions?: any;
+  teamId: string;
+  plan: string;
+  origin?: string;
+  subId?: string;
+  includeMetadata?: boolean;
+}
+
+export async function getMapResults({
+  url,
+  search,
+  limit = MAX_MAP_LIMIT,
+  ignoreSitemap = false,
+  includeSubdomains = false,
+  crawlerOptions = {},
+  teamId,
+  plan,
+  origin,
+  subId,
+  includeMetadata = false,
+}: MapOptions) {
   const startTime = new Date().getTime();
 
-  req.body = mapRequestSchema.parse(req.body);
-
-  const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
-
   const id = uuidv4();
-  let links: string[] = [req.body.url];
+  let links: { url: string; title?: string; description?: string }[] = [{ url }];
 
   const sc: StoredCrawl = {
-    originUrl: req.body.url,
-    crawlerOptions: legacyCrawlerOptions(req.body),
+    originUrl: url,
+    crawlerOptions,
     pageOptions: {},
-    team_id: req.auth.team_id,
+    team_id: teamId,
     createdAt: Date.now(),
-    plan: req.auth.plan,
+    plan,
   };
 
   const crawler = crawlToCrawler(id, sc);
 
-  let urlWithoutWww = req.body.url.replace("www.", "");
-
-  let mapUrl = req.body.search
-    ? `"${req.body.search}" site:${urlWithoutWww}`
-    : `site:${req.body.url}`;
+  let urlWithoutWww = url.replace("www.", "");
+  let mapUrl = search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;
 
   const resultsPerPage = 100;
   const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
@@ -81,12 +96,11 @@ export async function mapController(
     pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
     allResults = await Promise.all(pagePromises);
 
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60);
   }
 
-  // Parallelize sitemap fetch with serper search
   const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
+    ignoreSitemap ? null : crawler.tryGetSitemap(),
     ...(cachedResult ? [] : pagePromises),
   ]);
 
@@ -96,7 +110,7 @@ export async function mapController(
 
   if (sitemap !== null) {
     sitemap.forEach((x) => {
-      links.push(x.url);
+      links.push({ url: x.url });
     });
   }
 
@@ -110,67 +124,96 @@ export async function mapController(
   }
 
   if (mapResults.length > 0) {
-    if (req.body.search) {
-      // Ensure all map results are first, maintaining their order
+    if (search) {
       links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
+        { url: mapResults[0].url, title: mapResults[0].title, description: mapResults[0].description },
+        ...mapResults.slice(1).map((x) => ({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        })),
         ...links,
       ];
     } else {
-      mapResults.map((x) => {
-        links.push(x.url);
+      mapResults.forEach((x) => {
+        links.push({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        });
       });
     }
   }
 
-  // Perform cosine similarity between the search query and the list of links
-  if (req.body.search) {
-    const searchQuery = req.body.search.toLowerCase();
-
-    links = performCosineSimilarity(links, searchQuery);
+  if (search) {
+    const filteredLinks = performCosineSimilarity(links.map(l => l.url), search.toLowerCase());
+    links = links.filter(l => filteredLinks.includes(l.url));
   }
 
   links = links
     .map((x) => {
       try {
-        return checkAndUpdateURLForMap(x).url.trim();
+        return { ...x, url: checkAndUpdateURLForMap(x.url).url.trim() };
       } catch (_) {
         return null;
       }
     })
     .filter((x) => x !== null);
 
-  // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.url));
+  links = links.filter((x) => isSameDomain(x.url, url));
 
-  // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.url));
+  if (!includeSubdomains) {
+    links = links.filter((x) => isSameSubdomain(x.url, url));
   }
 
-  // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
-
-  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
-    Logger.error(
-      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
-    );
-    // Optionally, you could notify an admin or add to a retry queue here
-  });
+  links = removeDuplicateUrls(links.map(l => l.url)).map(url => links.find(l => l.url === url));
 
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
 
   const linksToReturn = links.slice(0, limit);
 
-  logJob({
-    job_id: id,
-    success: links.length > 0,
+  return {
+    links: includeMetadata ? linksToReturn : linksToReturn.map(l => l.url),
+    scrapeId: origin?.includes("website") ? id : undefined,
+    timeTakenInSeconds,
+    id,
+    linksLength: links.length,
+    linksToReturnLength: linksToReturn.length,
+    docs: linksToReturn.map(l => l.url),
+  };
+}
+
+export async function mapController(
+  req: RequestWithAuth<{}, MapResponse, MapRequest>,
+  res: Response<MapResponse>
+) {
+  req.body = mapRequestSchema.parse(req.body);
+
+  const results = await getMapResults({
+    url: req.body.url,
+    search: req.body.search,
+    limit: req.body.limit,
+    ignoreSitemap: req.body.ignoreSitemap,
+    includeSubdomains: req.body.includeSubdomains,
+    crawlerOptions: legacyCrawlerOptions(req.body),
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+    origin: req.body.origin,
+    subId: req.acuc?.sub_id,
+  });
+
+  await billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
+  });
+
+  await logJob({
+    job_id: results.id,
+    success: results.linksLength > 0,
     message: "Map completed",
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: timeTakenInSeconds,
+    num_docs: results.linksToReturnLength,
+    docs: results.docs,
+    time_taken: results.timeTakenInSeconds,
     team_id: req.auth.team_id,
     mode: "map",
     url: req.body.url,
@@ -183,55 +226,7 @@ export async function mapController(
 
   return res.status(200).json({
     success: true,
-    links: linksToReturn,
-    scrape_id: req.body.origin?.includes("website") ? id : undefined,
+    links: results.links.map(l => l.url),
+    scrape_id: results.scrapeId,
   });
 }
-
-// Subdomain sitemap url checking
-
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
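With this refactor, the map logic is callable outside the HTTP handler, which is exactly what the new extract controller does. A minimal sketch of a direct call follows; teamId and plan are placeholder values, while the option names and returned fields come from the diff above.

// Illustrative direct call to the refactored helper.
const results = await getMapResults({
  url: "https://example.com",
  search: "pricing",
  limit: 100,
  ignoreSitemap: true,
  includeSubdomains: false,
  includeMetadata: true, // return { url, title?, description? } objects instead of bare URLs
  teamId: "team_123",    // placeholder
  plan: "standard",      // placeholder
});
// results => { links, scrapeId?, timeTakenInSeconds, id, linksLength, linksToReturnLength, docs }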
apps/api/src/controllers/v1/types.ts
@@ -121,8 +121,21 @@ export const scrapeOptions = z.object({
 }).strict(strictMessage)
 
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
+export const extractV1Options = z.object({
+  urls: url.array(),
+  prompt: z.string().optional(),
+  schema: z.any().optional(),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
+}).strict(strictMessage)
+
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+
 export const scrapeRequestSchema = scrapeOptions.extend({
   url,
   origin: z.string().optional().default("api"),
@@ -142,6 +155,8 @@ export const scrapeRequestSchema = scrapeOptions.extend({
   return obj;
 });
 
+
+
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 
 export const batchScrapeRequestSchema = scrapeOptions.extend({
@@ -296,6 +311,21 @@ export interface ScrapeResponseRequestTest {
   error?: string;
 }
 
+export type ExtractResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: Document;
+      scrape_id?: string;
+    };
+
+export interface ExtractResponseRequestTest {
+  statusCode: number;
+  body: ExtractResponse;
+  error?: string;
+}
+
 export type CrawlResponse =
   | ErrorResponse
   | {
@@ -492,3 +522,11 @@ export function legacyDocumentConverter(doc: any): Document {
     },
   };
 }
+
+
+
+export interface MapDocument {
+  url: string;
+  title?: string;
+  description?: string;
+}
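A small sketch of how the new schema validates an extract request; the URLs and prompt are illustrative, and the defaults noted in the comment come from the schema definition above.

// Illustrative request body run through the new schema.
const body = extractRequestSchema.parse({
  urls: ["https://example.com/blog/*", "https://example.com/pricing"],
  prompt: "List every product name mentioned on these pages",
});
// body.origin defaults to "api" and body.timeout to 60000; schema stays optional (z.any()).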
apps/api/src/lib/extract/completions.ts (new file, 119 lines)
@@ -0,0 +1,119 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";

const maxTokens = 32000;
const modifier = 4;

export class LLMRefusalError extends Error {
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.name = "LLMRefusalError";
  }
}

interface GenerateCompletionsParams {
  systemPrompt?: string;
  prompt?: string;
  schema?: any;
  pagesContent: string;
}

export async function generateBasicCompletion(prompt: string) {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });

  return completion.choices[0].message.content;
}

export async function generateFinalExtraction({
  pagesContent,
  systemPrompt,
  prompt,
  schema,
}: GenerateCompletionsParams): Promise<{
  content: string;
  metadata: { numTokens: number; warning: string };
}> {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  let extractionContent = pagesContent;
  let numTokens = 0;
  let warning = "";

  const encoder = encoding_for_model(model);
  try {
    const tokens = encoder.encode(extractionContent);
    numTokens = tokens.length;
  } catch (error) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
  } finally {
    encoder.free();
  }

  if (numTokens > maxTokens) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
  }

  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  }

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: [{ type: "text", text: extractionContent }] },
      {
        role: "user",
        content: prompt
          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema.shape,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  if (jsonCompletion.choices[0].message.refusal !== null) {
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  const extraction = jsonCompletion.choices[0].message.parsed;

  return {
    content: extraction,
    metadata: {
      numTokens,
      warning,
    },
  };
}
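A sketch of how these helpers might be used together. It assumes OPENAI_API_KEY is set (new OpenAI() reads it) and that MODEL_NAME optionally overrides the default gpt-4o-mini; pagesContent and productSchema are placeholders, not values from this commit.

// Illustrative usage; pagesContent and productSchema are placeholders.
const keywords = await generateBasicCompletion(
  `If the user's prompt is: "extract all pricing tiers", what are the most important keywords? Output only the keywords, separated by commas.`
);

const { content, metadata } = await generateFinalExtraction({
  pagesContent,           // concatenated markdown/text of the scraped pages (placeholder)
  prompt: "Extract the pricing tiers and their monthly costs",
  schema: productSchema,  // placeholder; array schemas are wrapped in { items: ... } internally
});
// metadata.warning is non-empty when the input had to be trimmed to the 32,000-token budget.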
apps/api/src/lib/extract/reranker.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
});

export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0"
) {
  const rerank = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
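A sketch of calling the new reranker directly, assuming COHERE_API_KEY is set; the document strings and query are illustrative and mirror how extract.ts formats them.

// Illustrative call to the new reranker helper.
const ranked = await rerankDocuments(
  [
    "URL: https://example.com/pricing\nTITLE: Pricing\nDESCRIPTION: Plans and costs",
    "URL: https://example.com/blog\nTITLE: Blog\nDESCRIPTION: Company news",
  ],
  "What URLs are most relevant to the following prompt: pricing tiers",
  2 // topN
);
// ranked => [{ document, index, relevanceScore }, ...] sorted by relevanceScore, highest first.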