Nick: wip

This commit is contained in:
Nicolas 2024-10-28 16:02:07 -03:00
parent 8a4f4cb9d9
commit 78badf8f72
7 changed files with 1772 additions and 140 deletions

View File

@@ -53,7 +53,7 @@
    "@bull-board/api": "^5.20.5",
    "@bull-board/express": "^5.20.5",
    "@devil7softwares/pos": "^1.0.2",
-   "@dqbd/tiktoken": "^1.0.13",
+   "@dqbd/tiktoken": "^1.0.17",
    "@hyperdx/node-opentelemetry": "^0.8.1",
    "@logtail/node": "^0.4.12",
    "@nangohq/node": "^0.40.8",
@@ -73,6 +73,7 @@
    "cacheable-lookup": "^6.1.0",
    "cheerio": "^1.0.0-rc.12",
    "cohere": "^1.1.1",
+   "cohere-ai": "^7.14.0",
    "cors": "^2.8.5",
    "cron-parser": "^4.9.0",
    "date-fns": "^3.6.0",

apps/api/pnpm-lock.yaml (generated): 1324 lines changed. File diff suppressed because it is too large.

View File

@@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import {
  Document,
  legacyDocumentConverter,
  legacyExtractorOptions,
  legacyScrapeOptions,
  RequestWithAuth,
  ExtractRequest,
  extractRequestSchema,
  ExtractResponse,
  legacyCrawlerOptions,
  MapDocument,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
import { getMapResults } from "./map";
import { rerankDocuments } from "../../lib/extract/reranker";
import { generateBasicCompletion } from "../../lib/extract/completions";

export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>
) {
  req.body = extractRequestSchema.parse(req.body);
  let earlyReturn = false;
  const origin = req.body.origin;
  const timeout = req.body.timeout;
  // const pageOptions = legacyScrapeOptions(req.body);
  // const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
  const jobPriority = await getJobPriority({
    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
    basePriority: 10,
  });

  const urls = req.body.urls;
  const mappedDocuments: MapDocument[] = [];
  const prompt = req.body.prompt;
  const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);

  for (const url of urls) {
    if (url.endsWith("/*")) {
      const mapResults = await getMapResults({
        url: url.slice(0, -2),
        search: req.body.prompt,
        limit: 100,
        ignoreSitemap: true,
        includeSubdomains: false,
        crawlerOptions: {},
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        subId: req.acuc?.sub_id,
        includeMetadata: true
      });

      // top 3 links
      const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
      console.log(top3Links);
      // console.log(top3Links);

      mappedDocuments.push(...(mapResults.links as MapDocument[]));
      // transform mappedUrls to just documents
      // we quickly rerank
      const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + req.body.prompt.toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
      console.log(rerank);
    } else {
      mappedDocuments.push({ url });
    }
  }

  req.body.urls = mappedDocuments.map(x => x.url);

  // const job = await addScrapeJob(
  //   {
  //     url: req.body.url,
  //     mode: "single_urls",
  //     crawlerOptions: {},
  //     team_id: req.auth.team_id,
  //     plan: req.auth.plan,
  //     pageOptions,
  //     extractorOptions,
  //     origin: req.body.origin,
  //     is_scrape: true,
  //   },
  //   {},
  //   jobId,
  //   jobPriority
  // );

  // const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);

  // let doc: any | undefined;
  // try {
  //   doc = (await waitForJob(job.id, timeout + totalWait))[0];
  // } catch (e) {
  //   Logger.error(`Error in scrapeController: ${e}`);
  //   if (e instanceof Error && e.message.startsWith("Job wait")) {
  //     return res.status(408).json({
  //       success: false,
  //       error: "Request timed out",
  //     });
  //   } else {
  //     return res.status(500).json({
  //       success: false,
  //       error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
  //         extractorOptions && extractorOptions.mode !== "markdown"
  //           ? " - Could be due to LLM parsing issues"
  //           : ""
  //       }`,
  //     });
  //   }
  // }

  // await job.remove();

  // if (!doc) {
  //   console.error("!!! PANIC DOC IS", doc, job);
  //   return res.status(200).json({
  //     success: true,
  //     warning: "No page found",
  //     data: doc,
  //   });
  // }

  // delete doc.index;
  // delete doc.provider;

  // const endTime = new Date().getTime();
  // const timeTakenInSeconds = (endTime - startTime) / 1000;

  // const numTokens =
  //   doc && doc.markdown
  //     ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
  //     : 0;

  // let creditsToBeBilled = 1; // Assuming 1 credit per document
  // if (earlyReturn) {
  //   // Don't bill if we're early returning
  //   return;
  // }
  // if (req.body.extract && req.body.formats.includes("extract")) {
  //   creditsToBeBilled = 5;
  // }

  // billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
  //   Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
  //   // Optionally, you could notify an admin or add to a retry queue here
  // });

  // if (!pageOptions || !pageOptions.includeRawHtml) {
  //   if (doc && doc.rawHtml) {
  //     delete doc.rawHtml;
  //   }
  // }

  // if (pageOptions && pageOptions.includeExtract) {
  //   if (!pageOptions.includeMarkdown && doc && doc.markdown) {
  //     delete doc.markdown;
  //   }
  // }

  // logJob({
  //   job_id: jobId,
  //   success: true,
  //   message: "Scrape completed",
  //   num_docs: 1,
  //   docs: [doc],
  //   time_taken: timeTakenInSeconds,
  //   team_id: req.auth.team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: {},
  //   pageOptions: pageOptions,
  //   origin: origin,
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });

  return res.status(200).json({
    success: true,
    data: null,
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
}
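A sketch of how a client might exercise this controller, assuming it is mounted at POST /v1/extract; the route registration, host, and port are not part of this diff and are assumptions, but the body shape follows the new extractV1Options schema:

// Hypothetical request against the new extract controller; endpoint path and
// host are assumptions -- only the body shape (extractV1Options) is from this diff.
const response = await fetch("http://localhost:3002/v1/extract", {
  method: "POST",
  headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
  body: JSON.stringify({
    urls: ["https://example.com/*"], // a trailing /* routes through getMapResults + reranking
    prompt: "Extract the pricing tiers",
    timeout: 60000,
  }),
});
const body = await response.json(); // WIP: currently { success: true, data: null, scrape_id? }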

View File

@@ -15,11 +15,11 @@ import {
  removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
-import { billTeam } from "../../services/billing/credit_billing";
-import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
import { Logger } from "../../lib/logger";
import Redis from "ioredis";
+import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";

configDotenv();
const redis = new Redis(process.env.REDIS_URL);
@@ -29,35 +29,50 @@ const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;

-export async function mapController(
-  req: RequestWithAuth<{}, MapResponse, MapRequest>,
-  res: Response<MapResponse>
-) {
+interface MapOptions {
+  url: string;
+  search?: string;
+  limit?: number;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  crawlerOptions?: any;
+  teamId: string;
+  plan: string;
+  origin?: string;
+  subId?: string;
+  includeMetadata?: boolean;
+}
+
+export async function getMapResults({
+  url,
+  search,
+  limit = MAX_MAP_LIMIT,
+  ignoreSitemap = false,
+  includeSubdomains = false,
+  crawlerOptions = {},
+  teamId,
+  plan,
+  origin,
+  subId,
+  includeMetadata = false,
+}: MapOptions) {
  const startTime = new Date().getTime();
-  req.body = mapRequestSchema.parse(req.body);
-  const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
  const id = uuidv4();
-  let links: string[] = [req.body.url];
+  let links: { url: string; title?: string; description?: string }[] = [{ url }];

  const sc: StoredCrawl = {
-    originUrl: req.body.url,
-    crawlerOptions: legacyCrawlerOptions(req.body),
+    originUrl: url,
+    crawlerOptions,
    pageOptions: {},
-    team_id: req.auth.team_id,
+    team_id: teamId,
    createdAt: Date.now(),
-    plan: req.auth.plan,
+    plan,
  };

  const crawler = crawlToCrawler(id, sc);

-  let urlWithoutWww = req.body.url.replace("www.", "");
-  let mapUrl = req.body.search
-    ? `"${req.body.search}" site:${urlWithoutWww}`
-    : `site:${req.body.url}`;
+  let urlWithoutWww = url.replace("www.", "");
+  let mapUrl = search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;

  const resultsPerPage = 100;
  const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
@@ -81,12 +96,11 @@ export async function mapController(
    pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
    allResults = await Promise.all(pagePromises);

-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60);
  }

  // Parallelize sitemap fetch with serper search
  const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
+    ignoreSitemap ? null : crawler.tryGetSitemap(),
    ...(cachedResult ? [] : pagePromises),
  ]);
@@ -96,7 +110,7 @@
  if (sitemap !== null) {
    sitemap.forEach((x) => {
-      links.push(x.url);
+      links.push({ url: x.url });
    });
  }
@@ -110,67 +124,96 @@
  }

  if (mapResults.length > 0) {
-    if (req.body.search) {
-      // Ensure all map results are first, maintaining their order
+    if (search) {
      links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
+        { url: mapResults[0].url, title: mapResults[0].title, description: mapResults[0].description },
+        ...mapResults.slice(1).map((x) => ({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        })),
        ...links,
      ];
    } else {
-      mapResults.map((x) => {
-        links.push(x.url);
+      mapResults.forEach((x) => {
+        links.push({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        });
      });
    }
  }

  // Perform cosine similarity between the search query and the list of links
-  if (req.body.search) {
-    const searchQuery = req.body.search.toLowerCase();
-    links = performCosineSimilarity(links, searchQuery);
+  if (search) {
+    const filteredLinks = performCosineSimilarity(links.map(l => l.url), search.toLowerCase());
+    links = links.filter(l => filteredLinks.includes(l.url));
  }

  links = links
    .map((x) => {
      try {
-        return checkAndUpdateURLForMap(x).url.trim();
+        return { ...x, url: checkAndUpdateURLForMap(x.url).url.trim() };
      } catch (_) {
        return null;
      }
    })
    .filter((x) => x !== null);

  // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.url));
+  links = links.filter((x) => isSameDomain(x.url, url));

  // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.url));
+  if (!includeSubdomains) {
+    links = links.filter((x) => isSameSubdomain(x.url, url));
  }

  // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
-
-  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
-    Logger.error(
-      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
-    );
-    // Optionally, you could notify an admin or add to a retry queue here
-  });
+  links = removeDuplicateUrls(links.map(l => l.url)).map(url => links.find(l => l.url === url));

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;

  const linksToReturn = links.slice(0, limit);

-  logJob({
-    job_id: id,
-    success: links.length > 0,
+  return {
+    links: includeMetadata ? linksToReturn : linksToReturn.map(l => l.url),
+    scrapeId: origin?.includes("website") ? id : undefined,
+    timeTakenInSeconds,
+    id,
+    linksLength: links.length,
+    linksToReturnLength: linksToReturn.length,
+    docs: linksToReturn.map(l => l.url),
+  };
+}
+
+export async function mapController(
+  req: RequestWithAuth<{}, MapResponse, MapRequest>,
+  res: Response<MapResponse>
+) {
+  req.body = mapRequestSchema.parse(req.body);
+
+  const results = await getMapResults({
+    url: req.body.url,
+    search: req.body.search,
+    limit: req.body.limit,
+    ignoreSitemap: req.body.ignoreSitemap,
+    includeSubdomains: req.body.includeSubdomains,
+    crawlerOptions: legacyCrawlerOptions(req.body),
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+    origin: req.body.origin,
+    subId: req.acuc?.sub_id,
+  });
+
+  await billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
+  });
+
+  await logJob({
+    job_id: results.id,
+    success: results.linksLength > 0,
    message: "Map completed",
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: timeTakenInSeconds,
+    num_docs: results.linksToReturnLength,
+    docs: results.docs,
+    time_taken: results.timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "map",
    url: req.body.url,
@@ -183,55 +226,7 @@ export async function mapController(
  return res.status(200).json({
    success: true,
-    links: linksToReturn,
-    scrape_id: req.body.origin?.includes("website") ? id : undefined,
+    links: results.links.map(l => l.url),
+    scrape_id: results.scrapeId,
  });
}

-// Subdomain sitemap url checking
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
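With mapController reduced to a thin wrapper, other controllers can now call getMapResults directly (the new extract controller above does exactly this). A minimal sketch, with placeholder team and plan values:

// Sketch: direct getMapResults usage; teamId/plan values are placeholders.
const results = await getMapResults({
  url: "https://example.com",
  search: "pricing",
  ignoreSitemap: true,
  includeSubdomains: false,
  crawlerOptions: {},
  teamId: "team_123",
  plan: "standard",
  includeMetadata: true, // links come back as { url, title?, description? } objects
});
// With includeMetadata: false (the default), results.links is a plain string[].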

View File

@@ -121,8 +121,21 @@ export const scrapeOptions = z.object({
}).strict(strictMessage)

export type ScrapeOptions = z.infer<typeof scrapeOptions>;

+export const extractV1Options = z.object({
+  urls: url.array(),
+  prompt: z.string().optional(),
+  schema: z.any().optional(),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
+}).strict(strictMessage)
+
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+
export const scrapeRequestSchema = scrapeOptions.extend({
  url,
  origin: z.string().optional().default("api"),
@@ -142,6 +155,8 @@ export const scrapeRequestSchema = scrapeOptions.extend({
  return obj;
});

export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;

export const batchScrapeRequestSchema = scrapeOptions.extend({
@@ -296,6 +311,21 @@ export interface ScrapeResponseRequestTest {
  error?: string;
}

+export type ExtractResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: Document;
+      scrape_id?: string;
+    };
+
+export interface ExtractResponseRequestTest {
+  statusCode: number;
+  body: ExtractResponse;
+  error?: string;
+}
+
export type CrawlResponse =
  | ErrorResponse
  | {
@@ -492,3 +522,11 @@ export function legacyDocumentConverter(doc: any): Document {
    },
  };
}
+
+export interface MapDocument {
+  url: string;
+  title?: string;
+  description?: string;
+}
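For reference, a body that satisfies the new extractV1Options schema; the schema defaults fill in origin and timeout:

// Sketch: parsing a request body with the new schema.
const parsed = extractRequestSchema.parse({
  urls: ["https://example.com/docs/*", "https://example.com/pricing"],
  prompt: "Extract every product name and its monthly price",
  schema: { type: "object", properties: { products: { type: "array" } } },
});
// parsed.origin === "api" and parsed.timeout === 60000 via the defaults above.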

View File

@@ -0,0 +1,119 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";

const maxTokens = 32000;
const modifier = 4;

export class LLMRefusalError extends Error {
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.name = "LLMRefusalError";
  }
}

interface GenerateCompletionsParams {
  systemPrompt?: string;
  prompt?: string;
  schema?: any;
  pagesContent: string;
}

export async function generateBasicCompletion(prompt: string) {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });
  return completion.choices[0].message.content;
}

export async function generateFinalExtraction({
  pagesContent,
  systemPrompt,
  prompt,
  schema,
}: GenerateCompletionsParams): Promise<{
  content: string;
  metadata: { numTokens: number; warning: string };
}> {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  let extractionContent = pagesContent;
  let numTokens = 0;
  let warning = "";

  const encoder = encoding_for_model(model);
  try {
    const tokens = encoder.encode(extractionContent);
    numTokens = tokens.length;
  } catch (error) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `Failed to derive the number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
  } finally {
    encoder.free();
  }

  if (numTokens > maxTokens) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}) -- the input has been automatically trimmed.`;
  }

  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  }

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: [{ type: "text", text: extractionContent }] },
      {
        role: "user",
        content: prompt
          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema.shape,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  if (jsonCompletion.choices[0].message.refusal !== null) {
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  const extraction = jsonCompletion.choices[0].message.parsed;
  return {
    content: extraction,
    metadata: {
      numTokens,
      warning,
    },
  };
}
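The modifier constant encodes a rough heuristic of about 4 characters per token, so maxTokens * modifier trims by characters when token counting fails or overflows. Since the response_format branch reads schema.shape, a zod object schema is the expected input; a minimal usage sketch, where pagesContent is a stand-in variable:

// Sketch: calling generateFinalExtraction with a zod schema.
import { z } from "zod";

const { content, metadata } = await generateFinalExtraction({
  systemPrompt: "You extract structured data from web pages.",
  prompt: "List every pricing tier and its monthly cost.",
  schema: z.object({
    tiers: z.array(z.object({ name: z.string(), monthlyPrice: z.number() })),
  }),
  pagesContent: scrapedMarkdown, // stand-in for concatenated page markdown
});
if (metadata.warning) console.warn(metadata.warning);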

View File

@@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";

const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
});

export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0"
) {
  const rerank = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  return rerank.results
    .sort((a, b) => b.relevanceScore - a.relevanceScore)
    .map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
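Usage mirrors the call in the new extract controller: documents are flattened to strings before being reranked against the query. A minimal sketch:

// Sketch: reranking mapped documents against a prompt, as extract.ts does.
const ranked = await rerankDocuments(
  mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`),
  "What URLs are most relevant to the following prompt: pricing information"
);
// => up to topN results: [{ document, index, relevanceScore }, ...], highest score first.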