Mirror of https://github.com/mendableai/firecrawl.git, synced 2025-07-29 20:10:59 +00:00

Commit 78badf8f72 (parent 8a4f4cb9d9)
Nick: wip
@@ -53,7 +53,7 @@
     "@bull-board/api": "^5.20.5",
     "@bull-board/express": "^5.20.5",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.13",
+    "@dqbd/tiktoken": "^1.0.17",
     "@hyperdx/node-opentelemetry": "^0.8.1",
     "@logtail/node": "^0.4.12",
     "@nangohq/node": "^0.40.8",
@@ -73,6 +73,7 @@
     "cacheable-lookup": "^6.1.0",
     "cheerio": "^1.0.0-rc.12",
     "cohere": "^1.1.1",
+    "cohere-ai": "^7.14.0",
     "cors": "^2.8.5",
     "cron-parser": "^4.9.0",
     "date-fns": "^3.6.0",
1324  apps/api/pnpm-lock.yaml (generated)
File diff suppressed because it is too large
197  apps/api/src/controllers/v1/extract.ts (new file)
@@ -0,0 +1,197 @@
import { Request, Response } from "express";
import { Logger } from "../../lib/logger";
import {
  Document,
  legacyDocumentConverter,
  legacyExtractorOptions,
  legacyScrapeOptions,
  RequestWithAuth,
  ExtractRequest,
  extractRequestSchema,
  ExtractResponse,
  legacyCrawlerOptions,
  MapDocument,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType } from "../../types";
import { getMapResults } from "./map";
import { rerankDocuments } from "../../lib/extract/reranker";
import { generateBasicCompletion } from "../../lib/extract/completions";

export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>
) {
  req.body = extractRequestSchema.parse(req.body);
  let earlyReturn = false;

  const origin = req.body.origin;
  const timeout = req.body.timeout;
  // const pageOptions = legacyScrapeOptions(req.body);
  // const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
  const jobId = uuidv4();

  const startTime = new Date().getTime();
  const jobPriority = await getJobPriority({
    plan: req.auth.plan as PlanType,
    team_id: req.auth.team_id,
    basePriority: 10,
  });

  const urls = req.body.urls;
  const mappedDocuments: MapDocument[] = [];

  const prompt = req.body.prompt;
  const keywords = await generateBasicCompletion(`If the user's prompt is: "${prompt}", what are the most important keywords besides the extraction task? Output only the keywords, separated by commas.`);

  for (const url of urls) {
    if (url.endsWith("/*")) {
      const mapResults = await getMapResults({
        url: url.slice(0, -2),
        search: req.body.prompt,
        limit: 100,
        ignoreSitemap: true,
        includeSubdomains: false,
        crawlerOptions: {},
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        subId: req.acuc?.sub_id,
        includeMetadata: true
      });
      // top 3 links
      const top3Links = (mapResults.links as MapDocument[]).slice(0, 3);
      console.log(top3Links);
      // console.log(top3Links);
      mappedDocuments.push(...(mapResults.links as MapDocument[]));
      // transform mappedUrls to just documents
      // we quickly rerank
      const rerank = await rerankDocuments(mappedDocuments.map(x => `URL: ${x.url}\nTITLE: ${x.title}\nDESCRIPTION: ${x.description}`), "What URLs are most relevant to the following prompt: " + req.body.prompt.toLocaleLowerCase().replace("extract", " ").replace("extract ", " "));
      console.log(rerank);
    } else {
      mappedDocuments.push({ url });
    }
  }

  req.body.urls = mappedDocuments.map(x => x.url);

  // const job = await addScrapeJob(
  //   {
  //     url: req.body.url,
  //     mode: "single_urls",
  //     crawlerOptions: {},
  //     team_id: req.auth.team_id,
  //     plan: req.auth.plan,
  //     pageOptions,
  //     extractorOptions,
  //     origin: req.body.origin,
  //     is_scrape: true,
  //   },
  //   {},
  //   jobId,
  //   jobPriority
  // );

  // const totalWait = (req.body.waitFor ?? 0) + (req.body.actions ?? []).reduce((a,x) => (x.type === "wait" ? x.milliseconds : 0) + a, 0);

  // let doc: any | undefined;
  // try {
  //   doc = (await waitForJob(job.id, timeout + totalWait))[0];
  // } catch (e) {
  //   Logger.error(`Error in scrapeController: ${e}`);
  //   if (e instanceof Error && e.message.startsWith("Job wait")) {
  //     return res.status(408).json({
  //       success: false,
  //       error: "Request timed out",
  //     });
  //   } else {
  //     return res.status(500).json({
  //       success: false,
  //       error: `(Internal server error) - ${e && e?.message ? e.message : e} ${
  //         extractorOptions && extractorOptions.mode !== "markdown"
  //           ? " - Could be due to LLM parsing issues"
  //           : ""
  //       }`,
  //     });
  //   }
  // }

  // await job.remove();

  // if (!doc) {
  //   console.error("!!! PANIC DOC IS", doc, job);
  //   return res.status(200).json({
  //     success: true,
  //     warning: "No page found",
  //     data: doc,
  //   });
  // }

  // delete doc.index;
  // delete doc.provider;

  // const endTime = new Date().getTime();
  // const timeTakenInSeconds = (endTime - startTime) / 1000;
  // const numTokens =
  //   doc && doc.markdown
  //     ? numTokensFromString(doc.markdown, "gpt-3.5-turbo")
  //     : 0;

  // let creditsToBeBilled = 1; // Assuming 1 credit per document
  // if (earlyReturn) {
  //   // Don't bill if we're early returning
  //   return;
  // }
  // if(req.body.extract && req.body.formats.includes("extract")) {
  //   creditsToBeBilled = 5;
  // }

  // billTeam(req.auth.team_id, req.acuc?.sub_id, creditsToBeBilled).catch(error => {
  //   Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
  //   // Optionally, you could notify an admin or add to a retry queue here
  // });

  // if (!pageOptions || !pageOptions.includeRawHtml) {
  //   if (doc && doc.rawHtml) {
  //     delete doc.rawHtml;
  //   }
  // }

  // if(pageOptions && pageOptions.includeExtract) {
  //   if(!pageOptions.includeMarkdown && doc && doc.markdown) {
  //     delete doc.markdown;
  //   }
  // }

  // logJob({
  //   job_id: jobId,
  //   success: true,
  //   message: "Scrape completed",
  //   num_docs: 1,
  //   docs: [doc],
  //   time_taken: timeTakenInSeconds,
  //   team_id: req.auth.team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: {},
  //   pageOptions: pageOptions,
  //   origin: origin,
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });

  return res.status(200).json({
    success: true,
    data: null,
    scrape_id: origin?.includes("website") ? jobId : undefined,
  });
}
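A minimal sketch of how this new controller might be exercised once it is wired up. The route path, host, and bearer auth header are assumptions (route registration is not part of this diff); the body shape follows extractV1Options from types.ts (urls, optional prompt/schema, with origin and timeout defaults).

// Hypothetical caller; endpoint URL and Authorization header are assumptions.
async function callExtract(apiKey: string) {
  const res = await fetch("https://<your-firecrawl-host>/v1/extract", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      urls: ["https://example.com/*"],        // "/*" triggers the map + rerank path above
      prompt: "Extract the pricing tiers and their monthly cost",
      timeout: 60000,
    }),
  });
  // In this WIP state the controller answers { success: true, data: null, scrape_id? }.
  return res.json();
}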
@@ -15,11 +15,11 @@ import {
   removeDuplicateUrls,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
-import { billTeam } from "../../services/billing/credit_billing";
-import { logJob } from "../../services/logging/log_job";
 import { performCosineSimilarity } from "../../lib/map-cosine";
 import { Logger } from "../../lib/logger";
 import Redis from "ioredis";
+import { billTeam } from "../../services/billing/credit_billing";
+import { logJob } from "../../services/logging/log_job";
 
 configDotenv();
 const redis = new Redis(process.env.REDIS_URL);
@@ -29,35 +29,50 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;
 
-export async function mapController(
-  req: RequestWithAuth<{}, MapResponse, MapRequest>,
-  res: Response<MapResponse>
-) {
+interface MapOptions {
+  url: string;
+  search?: string;
+  limit?: number;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  crawlerOptions?: any;
+  teamId: string;
+  plan: string;
+  origin?: string;
+  subId?: string;
+  includeMetadata?: boolean;
+}
+
+export async function getMapResults({
+  url,
+  search,
+  limit = MAX_MAP_LIMIT,
+  ignoreSitemap = false,
+  includeSubdomains = false,
+  crawlerOptions = {},
+  teamId,
+  plan,
+  origin,
+  subId,
+  includeMetadata = false,
+}: MapOptions) {
   const startTime = new Date().getTime();
 
-  req.body = mapRequestSchema.parse(req.body);
-
-  const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
-
   const id = uuidv4();
-  let links: string[] = [req.body.url];
+  let links: { url: string; title?: string; description?: string }[] = [{ url }];
 
   const sc: StoredCrawl = {
-    originUrl: req.body.url,
-    crawlerOptions: legacyCrawlerOptions(req.body),
+    originUrl: url,
+    crawlerOptions,
     pageOptions: {},
-    team_id: req.auth.team_id,
+    team_id: teamId,
     createdAt: Date.now(),
-    plan: req.auth.plan,
+    plan,
   };
 
   const crawler = crawlToCrawler(id, sc);
 
-  let urlWithoutWww = req.body.url.replace("www.", "");
-
-  let mapUrl = req.body.search
-    ? `"${req.body.search}" site:${urlWithoutWww}`
-    : `site:${req.body.url}`;
+  let urlWithoutWww = url.replace("www.", "");
+  let mapUrl = search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;
 
   const resultsPerPage = 100;
   const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
@@ -81,12 +96,11 @@ export async function mapController(
     pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
     allResults = await Promise.all(pagePromises);
 
-    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
+    await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60);
   }
 
   // Parallelize sitemap fetch with serper search
   const [sitemap, ...searchResults] = await Promise.all([
-    req.body.ignoreSitemap ? null : crawler.tryGetSitemap(),
+    ignoreSitemap ? null : crawler.tryGetSitemap(),
     ...(cachedResult ? [] : pagePromises),
   ]);
 
@@ -96,7 +110,7 @@ export async function mapController(
 
   if (sitemap !== null) {
     sitemap.forEach((x) => {
-      links.push(x.url);
+      links.push({ url: x.url });
     });
   }
 
@@ -110,67 +124,96 @@ export async function mapController(
   }
 
   if (mapResults.length > 0) {
-    if (req.body.search) {
+    if (search) {
       // Ensure all map results are first, maintaining their order
       links = [
-        mapResults[0].url,
-        ...mapResults.slice(1).map((x) => x.url),
+        { url: mapResults[0].url, title: mapResults[0].title, description: mapResults[0].description },
+        ...mapResults.slice(1).map((x) => ({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        })),
         ...links,
       ];
     } else {
-      mapResults.map((x) => {
-        links.push(x.url);
+      mapResults.forEach((x) => {
+        links.push({
+          url: x.url,
+          title: x.title,
+          description: x.description
+        });
      });
    }
  }
 
   // Perform cosine similarity between the search query and the list of links
-  if (req.body.search) {
-    const searchQuery = req.body.search.toLowerCase();
-
-    links = performCosineSimilarity(links, searchQuery);
+  if (search) {
+    const filteredLinks = performCosineSimilarity(links.map(l => l.url), search.toLowerCase());
+    links = links.filter(l => filteredLinks.includes(l.url));
   }
 
   links = links
     .map((x) => {
       try {
-        return checkAndUpdateURLForMap(x).url.trim();
+        return { ...x, url: checkAndUpdateURLForMap(x.url).url.trim() };
       } catch (_) {
         return null;
       }
     })
     .filter((x) => x !== null);
 
   // allows for subdomains to be included
-  links = links.filter((x) => isSameDomain(x, req.body.url));
+  links = links.filter((x) => isSameDomain(x.url, url));
 
   // if includeSubdomains is false, filter out subdomains
-  if (!req.body.includeSubdomains) {
-    links = links.filter((x) => isSameSubdomain(x, req.body.url));
+  if (!includeSubdomains) {
+    links = links.filter((x) => isSameSubdomain(x.url, url));
   }
 
   // remove duplicates that could be due to http/https or www
-  links = removeDuplicateUrls(links);
-
-  billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
-    Logger.error(
-      `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
-    );
-    // Optionally, you could notify an admin or add to a retry queue here
-  });
+  links = removeDuplicateUrls(links.map(l => l.url)).map(url => links.find(l => l.url === url));
 
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
 
   const linksToReturn = links.slice(0, limit);
 
-  logJob({
-    job_id: id,
-    success: links.length > 0,
+  return {
+    links: includeMetadata ? linksToReturn : linksToReturn.map(l => l.url),
+    scrapeId: origin?.includes("website") ? id : undefined,
+    timeTakenInSeconds,
+    id,
+    linksLength: links.length,
+    linksToReturnLength: linksToReturn.length,
+    docs: linksToReturn.map(l => l.url),
+  };
+}
+
+export async function mapController(
+  req: RequestWithAuth<{}, MapResponse, MapRequest>,
+  res: Response<MapResponse>
+) {
+  req.body = mapRequestSchema.parse(req.body);
+
+  const results = await getMapResults({
+    url: req.body.url,
+    search: req.body.search,
+    limit: req.body.limit,
+    ignoreSitemap: req.body.ignoreSitemap,
+    includeSubdomains: req.body.includeSubdomains,
+    crawlerOptions: legacyCrawlerOptions(req.body),
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+    origin: req.body.origin,
+    subId: req.acuc?.sub_id,
+  });
+
+  await billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
+    Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
+  });
+
+  await logJob({
+    job_id: results.id,
+    success: results.linksLength > 0,
     message: "Map completed",
-    num_docs: linksToReturn.length,
-    docs: linksToReturn,
-    time_taken: timeTakenInSeconds,
+    num_docs: results.linksToReturnLength,
+    docs: results.docs,
+    time_taken: results.timeTakenInSeconds,
     team_id: req.auth.team_id,
     mode: "map",
     url: req.body.url,
@@ -183,55 +226,7 @@ export async function mapController(
 
   return res.status(200).json({
     success: true,
-    links: linksToReturn,
-    scrape_id: req.body.origin?.includes("website") ? id : undefined,
+    links: results.links.map(l => l.url),
+    scrape_id: results.scrapeId,
   });
 }
-
-// Subdomain sitemap url checking
-
-// // For each result, check for subdomains, get their sitemaps and add them to the links
-// const processedUrls = new Set();
-// const processedSubdomains = new Set();
-
-// for (const result of links) {
-//   let url;
-//   let hostParts;
-//   try {
-//     url = new URL(result);
-//     hostParts = url.hostname.split('.');
-//   } catch (e) {
-//     continue;
-//   }
-
-//   console.log("hostParts", hostParts);
-//   // Check if it's a subdomain (more than 2 parts, and not 'www')
-//   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-//     const subdomain = hostParts[0];
-//     console.log("subdomain", subdomain);
-//     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-//     console.log("subdomainUrl", subdomainUrl);
-
-//     if (!processedSubdomains.has(subdomainUrl)) {
-//       processedSubdomains.add(subdomainUrl);
-
-//       const subdomainCrawl = crawlToCrawler(id, {
-//         originUrl: subdomainUrl,
-//         crawlerOptions: legacyCrawlerOptions(req.body),
-//         pageOptions: {},
-//         team_id: req.auth.team_id,
-//         createdAt: Date.now(),
-//         plan: req.auth.plan,
-//       });
-//       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-//       if (subdomainSitemap) {
-//         subdomainSitemap.forEach((x) => {
-//           if (!processedUrls.has(x.url)) {
-//             processedUrls.add(x.url);
-//             links.push(x.url);
-//           }
-//         });
-//       }
-//     }
-//   }
-// }
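The refactor above makes getMapResults callable outside the HTTP handler; a small sketch of the two result shapes it can return depending on includeMetadata (teamId and plan values below are placeholders):

import { getMapResults } from "./map";
import { MapDocument } from "./types";

async function mapShapesDemo() {
  // includeMetadata: true (as extract.ts uses it): links carry title/description.
  const withMeta = await getMapResults({
    url: "https://example.com",
    search: "pricing",
    limit: 100,
    ignoreSitemap: true,
    includeSubdomains: false,
    teamId: "team_123", // placeholder
    plan: "hobby",      // placeholder
    includeMetadata: true,
  });
  const docs = withMeta.links as MapDocument[]; // [{ url, title?, description? }, ...]

  // includeMetadata omitted (as mapController uses it): links are plain URL strings.
  const plain = await getMapResults({
    url: "https://example.com",
    teamId: "team_123",
    plan: "hobby",
  });
  const urls = plain.links as string[];
  return { docs, urls };
}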
@@ -121,8 +121,21 @@ export const scrapeOptions = z.object({
 }).strict(strictMessage)
 
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
+export const extractV1Options = z.object({
+  urls: url.array(),
+  prompt: z.string().optional(),
+  schema: z.any().optional(),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
+}).strict(strictMessage)
+
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+
 export const scrapeRequestSchema = scrapeOptions.extend({
   url,
   origin: z.string().optional().default("api"),
@@ -142,6 +155,8 @@ export const scrapeRequestSchema = scrapeOptions.extend({
   return obj;
 });
 
+
+
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 
 export const batchScrapeRequestSchema = scrapeOptions.extend({
@@ -296,6 +311,21 @@ export interface ScrapeResponseRequestTest {
   error?: string;
 }
 
+export type ExtractResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: Document;
+      scrape_id?: string;
+    };
+
+export interface ExtractResponseRequestTest {
+  statusCode: number;
+  body: ExtractResponse;
+  error?: string;
+}
+
 export type CrawlResponse =
   | ErrorResponse
   | {
@@ -492,3 +522,11 @@ export function legacyDocumentConverter(doc: any): Document {
     },
   };
 }
+
+
+
+export interface MapDocument {
+  url: string;
+  title?: string;
+  description?: string;
+}
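A quick sketch of what the new extractRequestSchema accepts: the parsed object picks up the origin and timeout defaults, and unknown keys are rejected because the schema is strict.

import { extractRequestSchema } from "./types";

const parsed = extractRequestSchema.parse({
  urls: ["https://example.com/blog/*"],
  prompt: "List every author mentioned on the blog",
});
// parsed.origin === "api", parsed.timeout === 60000

// Throws a ZodError: unknown keys are rejected by .strict(strictMessage).
// extractRequestSchema.parse({ urls: ["https://example.com"], foo: 1 });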
119  apps/api/src/lib/extract/completions.ts (new file)
@@ -0,0 +1,119 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { ExtractOptions } from "../../controllers/v1/types";
import { Document } from "../entities";
import { z } from "zod";

const maxTokens = 32000;
const modifier = 4;

export class LLMRefusalError extends Error {
  constructor(refusal: string) {
    super("LLM refused to extract the website's content");
    this.name = "LLMRefusalError";
  }
}

interface GenerateCompletionsParams {
  systemPrompt?: string;
  prompt?: string;
  schema?: any;
  pagesContent: string;
}

export async function generateBasicCompletion(prompt: string) {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  const completion = await openai.chat.completions.create({
    model,
    messages: [{ role: "user", content: prompt }],
  });

  return completion.choices[0].message.content;
}

export async function generateFinalExtraction({
  pagesContent,
  systemPrompt,
  prompt,
  schema,
}: GenerateCompletionsParams): Promise<{
  content: string;
  metadata: { numTokens: number; warning: string };
}> {
  const openai = new OpenAI();
  const model: TiktokenModel =
    (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  let extractionContent = pagesContent;
  let numTokens = 0;
  let warning = "";

  const encoder = encoding_for_model(model);
  try {
    const tokens = encoder.encode(extractionContent);
    numTokens = tokens.length;
  } catch (error) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
  } finally {
    encoder.free();
  }

  if (numTokens > maxTokens) {
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
  }

  if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
    schema = {
      type: "object",
      properties: {
        items: schema,
      },
      required: ["items"],
      additionalProperties: false,
    };
  }

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    messages: [
      { role: "system", content: systemPrompt },
      { role: "user", content: [{ type: "text", text: extractionContent }] },
      {
        role: "user",
        content: prompt
          ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
          : "Transform the above content into structured JSON output.",
      },
    ],
    response_format: schema
      ? {
          type: "json_schema",
          json_schema: {
            name: "websiteContent",
            schema: schema.shape,
            strict: true,
          },
        }
      : { type: "json_object" },
  });

  if (jsonCompletion.choices[0].message.refusal !== null) {
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  const extraction = jsonCompletion.choices[0].message.parsed;

  return {
    content: extraction,
    metadata: {
      numTokens,
      warning,
    },
  };
}
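A minimal sketch of the basic completion helper, mirroring how extract.ts derives keywords from the user prompt; it assumes OPENAI_API_KEY is set and that MODEL_NAME optionally overrides the gpt-4o-mini default.

import { generateBasicCompletion } from "./completions";

async function keywordDemo() {
  const keywords = await generateBasicCompletion(
    'If the user\'s prompt is: "Extract all product names and prices", ' +
      "what are the most important keywords besides the extraction task? " +
      "Output only the keywords, separated by commas."
  );
  console.log(keywords); // e.g. "product names, prices"
}

keywordDemo().catch(console.error);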
22  apps/api/src/lib/extract/reranker.ts (new file)
@@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
});

export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0"
) {
  const rerank = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
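A short usage sketch, assuming COHERE_API_KEY is set; the document strings follow the URL/TITLE/DESCRIPTION format that extract.ts builds before reranking.

import { rerankDocuments } from "./reranker";

async function rerankDemo() {
  const docs = [
    "URL: https://example.com/pricing\nTITLE: Pricing\nDESCRIPTION: Plans and costs",
    "URL: https://example.com/blog\nTITLE: Blog\nDESCRIPTION: Company news",
    "URL: https://example.com/docs\nTITLE: Docs\nDESCRIPTION: API reference",
  ];
  // Returns up to topN (default 3) results as { document, index, relevanceScore },
  // sorted by relevanceScore descending.
  const ranked = await rerankDocuments(docs, "Which URL is most relevant to pricing?");
  console.log(ranked);
}

rerankDemo().catch(console.error);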