Nick: wip

This commit is contained in:
Nicolas 2024-10-28 16:02:07 -03:00
parent 8a4f4cb9d9
commit 78badf8f72
7 changed files with 1772 additions and 140 deletions

View File

@@ -53,7 +53,7 @@
    "@bull-board/api": "^5.20.5",
    "@bull-board/express": "^5.20.5",
    "@devil7softwares/pos": "^1.0.2",
-   "@dqbd/tiktoken": "^1.0.13",
+   "@dqbd/tiktoken": "^1.0.17",
    "@hyperdx/node-opentelemetry": "^0.8.1",
    "@logtail/node": "^0.4.12",
    "@nangohq/node": "^0.40.8",
@@ -73,6 +73,7 @@
    "cacheable-lookup": "^6.1.0",
    "cheerio": "^1.0.0-rc.12",
    "cohere": "^1.1.1",
+   "cohere-ai": "^7.14.0",
    "cors": "^2.8.5",
    "cron-parser": "^4.9.0",
    "date-fns": "^3.6.0",

apps/api/pnpm-lock.yaml (generated): 1324 lines changed. File diff suppressed because it is too large.

View File

@@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import {
  Document,
  legacyDocumentConverter,
  legacyExtractorOptions,
  legacyScrapeOptions,
  RequestWithAuth,
  ExtractRequest,
  extractRequestSchema,
  ExtractResponse,
  legacyCrawlerOptions,
  MapDocument,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
import { getMapResults } from "./map";
import { rerankDocuments } from "../../lib/extract/reranker";
import { generateBasicCompletion } from "../../lib/extract/completions";

export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>
) {
  req.body = extractRequestSchema.parse(req.body);
  let earlyReturn = false;
  const origin = req.body.origin;
  const timeout = req.body.timeout;
  // const pageOptions = legacyScrapeOptions(req.body);
  // const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
  const jobPriority = await getJobPriority({
    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
    basePriority: 10,
  });

  const urls = req.body.urls;
  const mappedDocuments: MapDocument[] = [];
  const prompt = req.body.prompt;
  const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);

  for (const url of urls) {
    if (url.endsWith("/*")) {
      const mapResults = await getMapResults({
        url: url.slice(0, -2),
        search: req.body.prompt,
        limit: 100,
        ignoreSitemap: true,
        includeSubdomains: false,
        crawlerOptions: {},
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        subId: req.acuc?.sub_id,
        includeMetadata: true
      });

      // top 3 links
      const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
      console.log(top3Links);
      // console.log(top3Links);

      mappedDocuments.push(...(mapResults.links as MapDocument[]));
      // transform mappedUrls to just documents
      // we quickly rerank
      const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + req.body.prompt.toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
      console.log(rerank);
    } else {
      mappedDocuments.push({ url });
    }
  }

  req.body.urls = mappedDocuments.map(x => x.url);

  // const job = await addScrapeJob(
  //   {
  //     url: req.body.url,
  //     mode: "single_urls",
  //     crawlerOptions: {},
  //     team_id: req.auth.team_id,
  //     plan: req.auth.plan,
  //     pageOptions,
  //     extractorOptions,
  //     origin: req.body.origin,
  //     is_scrape: true,
  //   },
  //   {},
  //   jobId,
  //   jobPriority
  // );

  // const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);

  // let doc: any | undefined;
  // try {
  //   doc = (await waitForJob(job.id, timeout + totalWait))[0];
  // } catch (e) {
  //   Logger.error(`Error in scrapeController: ${e}`);
  //   if (e instanceof Error && e.message.startsWith("Job wait")) {
  //     return res.status(408).json({
  //       success: false,
  //       error: "Request timed out",
  //     });
  //   } else {
  //     return res.status(500).json({
  //       success: false,
  //       error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
  //         extractorOptions && extractorOptions.mode !== "markdown"
  //           ? " - Could be due to LLM parsing issues"
  //           : ""
  //       }`,
  //     });
  //   }
  // }

  // await job.remove();

  // if (!doc) {
  //   console.error("!!! PANIC DOC IS", doc, job);
  //   return res.status(200).json({
  //     success: true,
  //     warning: "No page found",
  //     data: doc,
  //   });
  // }

  // delete doc.index;
  // delete doc.provider;

  // const endTime = new Date().getTime();
  // const timeTakenInSeconds = (endTime - startTime) / 1000;

  // const numTokens =
  //   doc && doc.markdown
  //     ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
  //     : 0;

  // let creditsToBeBilled = 1; // Assuming 1 credit per document
  // if (earlyReturn) {
  //   // Don't bill if we're early returning
  //   return;
  // }
  // if (req.body.extract && req.body.formats.includes("extract")) {
  //   creditsToBeBilled = 5;
  // }

  // billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
  //   Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
  //   // Optionally, you could notify an admin or add to a retry queue here
  // });

  // if (!pageOptions || !pageOptions.includeRawHtml) {
  //   if (doc && doc.rawHtml) {
  //     delete doc.rawHtml;
  //   }
  // }

  // if (pageOptions && pageOptions.includeExtract) {
  //   if (!pageOptions.includeMarkdown && doc && doc.markdown) {
  //     delete doc.markdown;
  //   }
  // }

  // logJob({
  //   job_id: jobId,
  //   success: true,
  //   message: "Scrape completed",
  //   num_docs: 1,
  //   docs: [doc],
  //   time_taken: timeTakenInSeconds,
  //   team_id: req.auth.team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: {},
  //   pageOptions: pageOptions,
  //   origin: origin,
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });

  return res.status(200).json({
    success: true,
    data: null,
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
}
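A sketch of how a client might exercise this controller, assuming it is mounted at POST /v1/extract; the route registration, host, and port are not part of this diff and are assumptions, but the body shape follows the new extractV1Options schema:

// Hypothetical request against the new extract controller; endpoint path and
// host are assumptions -- only the body shape (extractV1Options) is from this diff.
const response = await fetch("http://localhost:3002/v1/extract", {
  method: "POST",
  headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
  body: JSON.stringify({
    urls: ["https://example.com/*"], // a trailing /* routes through getMapResults + reranking
    prompt: "Extract the pricing tiers",
    timeout: 60000,
  }),
});
const body = await response.json(); // WIP: currently { success: true, data: null, scrape_id? }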

View File

@@ -15,11 +15,11 @@ import {
  removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
-import { billTeam } from "../../services/billing/credit_billing";
-import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
import { Logger } from "../../lib/logger";
import Redis from "ioredis";
+import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";

configDotenv();
const redis = new Redis(process.env.REDIS_URL);
@@ -29,35 +29,50 @@ const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;

-export async function mapController(
-  req: RequestWithAuth<{}, MapResponse, MapRequest>,
-  res: Response<MapResponse>
-) {
+interface MapOptions {
+  url: string;
+  search?: string;
+  limit?: number;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  crawlerOptions?: any;
+  teamId: string;
+  plan: string;
+  origin?: string;
+  subId?: string;
+  includeMetadata?: boolean;
+}
+
+export async function getMapResults({
+  url,
+  search,
+  limit = MAX_MAP_LIMIT,
+  ignoreSitemap = false,
+  includeSubdomains = false,
+  crawlerOptions = {},
+  teamId,
+  plan,
+  origin,
+  subId,
+  includeMetadata = false,
+}: MapOptions) {
  const startTime = new Date().getTime();
-  req.body = mapRequestSchema.parse(req.body);
-  const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
  const id = uuidv4();
-  let links: string[] = [req.body.url];
+  let links: { url: string; title?: string; description?: string }[] = [{ url }];

  const sc: StoredCrawl = {
-    originUrl: req.body.url,
-    crawlerOptions: legacyCrawlerOptions(req.body),
+    originUrl: url,
+    crawlerOptions,
    pageOptions: {},
-    team_id: req.auth.team_id,
+    team_id: teamId,
    createdAt: Date.now(),
-    plan: req.auth.plan,
+    plan,
  };

  const crawler = crawlToCrawler(id, sc);

-  let urlWithoutWww = req.body.url.replace("www.", "");
-  let mapUrl = req.body.search
-    ? `"${req.body.search}" site:${urlWithoutWww}`
-    : `site:${req.body.url}`;
+  let urlWithoutWww = url.replace("www.", "");
+  let mapUrl = search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;

  const resultsPerPage = 100;
  const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
@@ -81,12 +96,11 @@ export async function mapController(
    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
    allResults = await Promise.all(pagePromises);

-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60);
  }

  // Parallelize sitemap fetch with serper search
  const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
+    ignoreSitemap ? null : crawler.tryGetSitemap(),
    ...(cachedResult ? [] : pagePromises),
  ]);
@@ -96,7 +110,7 @@
  if (sitemap !== null) {
    sitemap.forEach((x) => {
-      links.push(x.url);
+      links.push({ url: x.url });
    });
  }
@@ -110,67 +124,96 @@
  }

  if (mapResults.length > 0) {
-    if (req.body.search) {
-      // Ensure all map results are first, maintaining their order
+    if (search) {
      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
+        { url: mapResults[0].url, title: mapResults[0].title, description: mapResults[0].description },
+        ...mapResults.slice(1).map((x) => ({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        })),
        ...links,
      ];
    } else {
-      mapResults.map((x) => {
-        links.push(x.url);
+      mapResults.forEach((x) => {
+        links.push({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        });
      });
    }
  }

  // Perform cosine similarity between the search query and the list of links
-  if (req.body.search) {
-    const searchQuery = req.body.search.toLowerCase();
-    links = performCosineSimilarity(links, searchQuery);
+  if (search) {
+    const filteredLinks = performCosineSimilarity(links.map(l => l.url), search.toLowerCase());
+    links = links.filter(l => filteredLinks.includes(l.url));
  }

  links = links
    .map((x) => {
      try {
-        return checkAndUpdateURLForMap(x).url.trim();
+        return { ...x, url: checkAndUpdateURLForMap(x.url).url.trim() };
      } catch (_) {
        return null;
      }
    })
    .filter((x) => x !== null);

  // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.url));
+  links = links.filter((x) => isSameDomain(x.url, url));

  // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.url));
+  if (!includeSubdomains) {
+    links = links.filter((x) => isSameSubdomain(x.url, url));
  }

  // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
-
-  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
-    Logger.error(
-      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
-    );
-    // Optionally, you could notify an admin or add to a retry queue here
-  });
+  links = removeDuplicateUrls(links.map(l => l.url)).map(url => links.find(l => l.url === url));

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;

  const linksToReturn = links.slice(0, limit);

-  logJob({
-    job_id: id,
-    success: links.length > 0,
+  return {
+    links: includeMetadata ? linksToReturn : linksToReturn.map(l => l.url),
+    scrapeId: origin?.includes("website") ? id : undefined,
+    timeTakenInSeconds,
+    id,
+    linksLength: links.length,
+    linksToReturnLength: linksToReturn.length,
+    docs: linksToReturn.map(l => l.url),
+  };
+}
+
+export async function mapController(
+  req: RequestWithAuth<{}, MapResponse, MapRequest>,
+  res: Response<MapResponse>
+) {
+  req.body = mapRequestSchema.parse(req.body);
+
+  const results = await getMapResults({
+    url: req.body.url,
+    search: req.body.search,
+    limit: req.body.limit,
+    ignoreSitemap: req.body.ignoreSitemap,
+    includeSubdomains: req.body.includeSubdomains,
+    crawlerOptions: legacyCrawlerOptions(req.body),
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+    origin: req.body.origin,
+    subId: req.acuc?.sub_id,
+  });
+
+  await billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
+  });
+
+  await logJob({
+    job_id: results.id,
+    success: results.linksLength > 0,
    message: "Map completed",
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: timeTakenInSeconds,
+    num_docs: results.linksToReturnLength,
+    docs: results.docs,
+    time_taken: results.timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "map",
    url: req.body.url,
@@ -183,55 +226,7 @@ export async function mapController(
  return res.status(200).json({
    success: true,
-    links: linksToReturn,
-    scrape_id: req.body.origin?.includes("website") ? id : undefined,
+    links: results.links.map(l => l.url),
+    scrape_id: results.scrapeId,
  });
}

-// Subdomain sitemap url checking
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
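With mapController reduced to a thin wrapper, other controllers can now call getMapResults directly (the new extract controller above does exactly this). A minimal sketch, with placeholder team and plan values:

// Sketch: direct getMapResults usage; teamId/plan values are placeholders.
const results = await getMapResults({
  url: "https://example.com",
  search: "pricing",
  ignoreSitemap: true,
  includeSubdomains: false,
  crawlerOptions: {},
  teamId: "team_123",
  plan: "standard",
  includeMetadata: true, // links come back as { url, title?, description? } objects
});
// With includeMetadata: false (the default), results.links is a plain string[].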

View File

@@ -121,8 +121,21 @@ export const scrapeOptions = z.object({
}).strict(strictMessage)

export type ScrapeOptions = z.infer<typeof scrapeOptions>;

+export const extractV1Options = z.object({
+  urls: url.array(),
+  prompt: z.string().optional(),
+  schema: z.any().optional(),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
+}).strict(strictMessage)
+
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+
export const scrapeRequestSchema = scrapeOptions.extend({
  url,
  origin: z.string().optional().default("api"),
@@ -142,6 +155,8 @@ export const scrapeRequestSchema = scrapeOptions.extend({
  return obj;
});

export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;

export const batchScrapeRequestSchema = scrapeOptions.extend({
@@ -296,6 +311,21 @@ export interface ScrapeResponseRequestTest {
  error?: string;
}

+export type ExtractResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: Document;
+      scrape_id?: string;
+    };
+
+export interface ExtractResponseRequestTest {
+  statusCode: number;
+  body: ExtractResponse;
+  error?: string;
+}
+
export type CrawlResponse =
  | ErrorResponse
  | {
@@ -492,3 +522,11 @@ export function legacyDocumentConverter(doc: any): Document {
    },
  };
}
+
+export interface MapDocument {
+  url: string;
+  title?: string;
+  description?: string;
+}
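For reference, a body that satisfies the new extractV1Options schema; the schema defaults fill in origin and timeout:

// Sketch: parsing a request body with the new schema.
const parsed = extractRequestSchema.parse({
  urls: ["https://example.com/docs/*", "https://example.com/pricing"],
  prompt: "Extract every product name and its monthly price",
  schema: { type: "object", properties: { products: { type: "array" } } },
});
// parsed.origin === "api" and parsed.timeout === 60000 via the defaults above.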

View File

@@ -0,0 +1,119 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";

const maxTokens = 32000;
const modifier = 4;

export class LLMRefusalError extends Error {
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.name = "LLMRefusalError";
  }
}

interface GenerateCompletionsParams {
  systemPrompt?: string;
  prompt?: string;
  schema?: any;
  pagesContent: string;
}

export async function generateBasicCompletion(prompt: string) {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });
  return completion.choices[0].message.content;
}

export async function generateFinalExtraction({
  pagesContent,
  systemPrompt,
  prompt,
  schema,
}: GenerateCompletionsParams): Promise<{
  content: string;
  metadata: { numTokens: number; warning: string };
}> {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  let extractionContent = pagesContent;
  let numTokens = 0;
  let warning = "";

  const encoder = encoding_for_model(model);
  try {
    const tokens = encoder.encode(extractionContent);
    numTokens = tokens.length;
  } catch (error) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `Failed to derive the number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
  } finally {
    encoder.free();
  }

  if (numTokens > maxTokens) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}) -- the input has been automatically trimmed.`;
  }

  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  }

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: [{ type: "text", text: extractionContent }] },
      {
        role: "user",
        content: prompt
          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema.shape,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  if (jsonCompletion.choices[0].message.refusal !== null) {
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  const extraction = jsonCompletion.choices[0].message.parsed;
  return {
    content: extraction,
    metadata: {
      numTokens,
      warning,
    },
  };
}
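The modifier constant encodes a rough heuristic of about 4 characters per token, so maxTokens * modifier trims by characters when token counting fails or overflows. Since the response_format branch reads schema.shape, a zod object schema is the expected input; a minimal usage sketch, where pagesContent is a stand-in variable:

// Sketch: calling generateFinalExtraction with a zod schema.
import { z } from "zod";

const { content, metadata } = await generateFinalExtraction({
  systemPrompt: "You extract structured data from web pages.",
  prompt: "List every pricing tier and its monthly cost.",
  schema: z.object({
    tiers: z.array(z.object({ name: z.string(), monthlyPrice: z.number() })),
  }),
  pagesContent: scrapedMarkdown, // stand-in for concatenated page markdown
});
if (metadata.warning) console.warn(metadata.warning);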

View File

@@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";

const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
});

export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0"
) {
  const rerank = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  return rerank.results
    .sort((a, b) => b.relevanceScore - a.relevanceScore)
    .map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
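Usage mirrors the call in the new extract controller: documents are flattened to strings before being reranked against the query. A minimal sketch:

// Sketch: reranking mapped documents against a prompt, as extract.ts does.
const ranked = await rerankDocuments(
  mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`),
  "What URLs are most relevant to the following prompt: pricing information"
);
// => up to topN results: [{ document, index, relevanceScore }, ...], highest score first.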